9#ifndef __ARM_FEATURE_SVE
10#pragma message "Requested to compile LJFunctorSVE but SVE is not available!"
43template <
class Particle_T,
bool applyShift =
false,
bool useMixing =
false,
45 bool countFLOPs =
false,
bool relevantForTuning =
true>
48 calculateGlobals, countFLOPs, relevantForTuning>> {
49 using SoAArraysType =
typename Particle_T::SoAArraysType;
64#ifdef __ARM_FEATURE_SVE
66 calculateGlobals, countFLOPs, relevantForTuning>>(cutoff),
67 _cutoffSquared{cutoff * cutoff},
68 _cutoffSquaredAoS(cutoff * cutoff),
69 _potentialEnergySum{0.},
70 _virialSum{0., 0., 0.},
72 _postProcessed{
false} {
73 if (calculateGlobals) {
76 if constexpr (countFLOPs) {
77 AutoPasLog(DEBUG,
"Using LJFunctorSVE with countFLOPs but FLOP counting is not implemented.");
82 calculateGlobals, countFLOPs, relevantForTuning>>(cutoff) {
96 static_assert(not useMixing,
97 "Mixing without a ParticlePropertiesLibrary is not possible! Use a different constructor or set "
109 static_assert(useMixing,
110 "Not using Mixing but using a ParticlePropertiesLibrary is not allowed! Use a different constructor "
111 "or set mixing to true.");
112 _PPLibrary = &particlePropertiesLibrary;
115 std::string
getName() final {
return "LJFunctorSVE"; }
120 return useNewton3 == autopas::FunctorN3Modes::Newton3Only or useNewton3 == autopas::FunctorN3Modes::Both;
124 return useNewton3 == autopas::FunctorN3Modes::Newton3Off or useNewton3 == autopas::FunctorN3Modes::Both;
127 void AoSFunctor(Particle_T &i, Particle_T &j,
bool newton3)
final {
128 using namespace autopas::utils::ArrayMath::literals;
130 if (i.isDummy() or j.isDummy()) {
133 auto sigmaSquared = _sigmaSquaredAoS;
134 auto epsilon24 = _epsilon24AoS;
135 auto shift6 = _shift6AoS;
136 if constexpr (useMixing) {
139 if constexpr (applyShift) {
143 auto dr = i.getR() - j.getR();
146 if (dr2 > _cutoffSquaredAoS) {
150 double invdr2 = 1. / dr2;
151 double lj6 = sigmaSquared * invdr2;
152 lj6 = lj6 * lj6 * lj6;
153 double lj12 = lj6 * lj6;
154 double lj12m6 = lj12 - lj6;
155 double fac = epsilon24 * (lj12 + lj12m6) * invdr2;
162 if (calculateGlobals) {
166 auto virial = dr * f;
167 double potentialEnergy6 = epsilon24 * lj12m6 + shift6;
171 _aosThreadData[threadnum].potentialEnergySum += potentialEnergy6;
172 _aosThreadData[threadnum].virialSum += virial;
175 if (newton3 and j.isOwned()) {
176 _aosThreadData[threadnum].potentialEnergySum += potentialEnergy6;
177 _aosThreadData[threadnum].virialSum += virial;
189 SoAFunctorSingleImpl<true>(soa);
191 SoAFunctorSingleImpl<false>(soa);
201 const bool newton3)
final {
203 SoAFunctorPairImpl<true>(soa1, soa2);
205 SoAFunctorPairImpl<false>(soa1, soa2);
215 template <
bool newton3>
217#ifdef __ARM_FEATURE_SVE
218 if (soa.
size() == 0)
return;
220 const auto *
const __restrict xptr = soa.template begin<Particle_T::AttributeNames::posX>();
221 const auto *
const __restrict yptr = soa.template begin<Particle_T::AttributeNames::posY>();
222 const auto *
const __restrict zptr = soa.template begin<Particle_T::AttributeNames::posZ>();
224 const auto *
const __restrict ownedStatePtr = soa.template begin<Particle_T::AttributeNames::ownershipState>();
226 auto *
const __restrict fxptr = soa.template begin<Particle_T::AttributeNames::forceX>();
227 auto *
const __restrict fyptr = soa.template begin<Particle_T::AttributeNames::forceY>();
228 auto *
const __restrict fzptr = soa.template begin<Particle_T::AttributeNames::forceZ>();
230 const auto *
const __restrict typeIDptr = soa.template begin<Particle_T::AttributeNames::typeId>();
232 svfloat64_t virialSumX = svdup_f64(0.0);
233 svfloat64_t virialSumY = svdup_f64(0.0);
234 svfloat64_t virialSumZ = svdup_f64(0.0);
235 svfloat64_t potentialEnergySum = svdup_f64(0.0);
237 const auto vecLength = (size_t)svlen_f64(potentialEnergySum);
241 for (
size_t i = soa.
size() - 1; (long)i >= 0; --i) {
247 static_assert(std::is_same_v<std::underlying_type_t<autopas::OwnershipState>, int64_t>,
248 "OwnershipStates underlying type should be int64_t!");
250 svfloat64_t fxacc = svdup_f64(0.0);
251 svfloat64_t fyacc = svdup_f64(0.0);
252 svfloat64_t fzacc = svdup_f64(0.0);
254 const svfloat64_t x1 = svdup_f64(xptr[i]);
255 const svfloat64_t y1 = svdup_f64(yptr[i]);
256 const svfloat64_t z1 = svdup_f64(zptr[i]);
258 svbool_t pg_1, pg_2, pg_3, pg_4;
260 for (; j < i; j += vecLength * 4) {
261 const size_t j_2 = j + vecLength;
262 const size_t j_3 = j_2 + vecLength;
263 const size_t j_4 = j_3 + vecLength;
264 pg_1 = svwhilelt_b64(j, i);
265 pg_2 = svwhilelt_b64(j_2, i);
266 pg_3 = svwhilelt_b64(j_3, i);
267 pg_4 = svwhilelt_b64(j_4, i);
270 reinterpret_cast<const int64_t *
>(ownedStatePtr), x1, y1, z1, xptr, yptr, zptr, fxptr,
271 fyptr, fzptr, &typeIDptr[i], typeIDptr, fxacc, fyacc, fzacc, virialSumX, virialSumY,
272 virialSumZ, potentialEnergySum, pg_1, svundef_u64(), pg_2, svundef_u64(), pg_3,
273 svundef_u64(), pg_4, svundef_u64());
276 fxptr[i] += svaddv(svptrue_b64(), fxacc);
277 fyptr[i] += svaddv(svptrue_b64(), fyacc);
278 fzptr[i] += svaddv(svptrue_b64(), fzacc);
281 if constexpr (calculateGlobals) {
284 _aosThreadData[threadnum].potentialEnergySum += svaddv_f64(svptrue_b64(), potentialEnergySum);
285 _aosThreadData[threadnum].virialSum[0] += svaddv_f64(svptrue_b64(), virialSumX);
286 _aosThreadData[threadnum].virialSum[1] += svaddv_f64(svptrue_b64(), virialSumY);
287 _aosThreadData[threadnum].virialSum[2] += svaddv_f64(svptrue_b64(), virialSumZ);
292 template <
bool newton3>
294#ifdef __ARM_FEATURE_SVE
295 if (soa1.
size() == 0 || soa2.
size() == 0)
return;
297 const auto *
const __restrict x1ptr = soa1.template begin<Particle_T::AttributeNames::posX>();
298 const auto *
const __restrict y1ptr = soa1.template begin<Particle_T::AttributeNames::posY>();
299 const auto *
const __restrict z1ptr = soa1.template begin<Particle_T::AttributeNames::posZ>();
300 const auto *
const __restrict x2ptr = soa2.template begin<Particle_T::AttributeNames::posX>();
301 const auto *
const __restrict y2ptr = soa2.template begin<Particle_T::AttributeNames::posY>();
302 const auto *
const __restrict z2ptr = soa2.template begin<Particle_T::AttributeNames::posZ>();
304 const auto *
const __restrict ownedStatePtr1 = soa1.template begin<Particle_T::AttributeNames::ownershipState>();
305 const auto *
const __restrict ownedStatePtr2 = soa2.template begin<Particle_T::AttributeNames::ownershipState>();
307 auto *
const __restrict fx1ptr = soa1.template begin<Particle_T::AttributeNames::forceX>();
308 auto *
const __restrict fy1ptr = soa1.template begin<Particle_T::AttributeNames::forceY>();
309 auto *
const __restrict fz1ptr = soa1.template begin<Particle_T::AttributeNames::forceZ>();
310 auto *
const __restrict fx2ptr = soa2.template begin<Particle_T::AttributeNames::forceX>();
311 auto *
const __restrict fy2ptr = soa2.template begin<Particle_T::AttributeNames::forceY>();
312 auto *
const __restrict fz2ptr = soa2.template begin<Particle_T::AttributeNames::forceZ>();
314 const auto *
const __restrict typeID1ptr = soa1.template begin<Particle_T::AttributeNames::typeId>();
315 const auto *
const __restrict typeID2ptr = soa2.template begin<Particle_T::AttributeNames::typeId>();
317 svfloat64_t virialSumX = svdup_f64(0.0);
318 svfloat64_t virialSumY = svdup_f64(0.0);
319 svfloat64_t virialSumZ = svdup_f64(0.0);
320 svfloat64_t potentialEnergySum = svdup_f64(0.0);
322 const auto vecLength = (
unsigned int)svlen_f64(potentialEnergySum);
324 for (
unsigned int i = 0; i < soa1.
size(); ++i) {
330 svfloat64_t fxacc = svdup_f64(0.0);
331 svfloat64_t fyacc = svdup_f64(0.0);
332 svfloat64_t fzacc = svdup_f64(0.0);
334 static_assert(std::is_same_v<std::underlying_type_t<autopas::OwnershipState>, int64_t>,
335 "OwnershipStates underlying type should be int64_t!");
337 const svfloat64_t x1 = svdup_f64(x1ptr[i]);
338 const svfloat64_t y1 = svdup_f64(y1ptr[i]);
339 const svfloat64_t z1 = svdup_f64(z1ptr[i]);
341 svbool_t pg_1, pg_2, pg_3, pg_4;
343 for (; j < soa2.
size(); j += vecLength * 4) {
344 const unsigned int j_2 = j + vecLength;
345 const unsigned int j_3 = j_2 + vecLength;
346 const unsigned int j_4 = j_3 + vecLength;
347 pg_1 = svwhilelt_b64(j, (
unsigned int)soa2.
size());
348 pg_2 = svwhilelt_b64(j_2, (
unsigned int)soa2.
size());
349 pg_3 = svwhilelt_b64(j_3, (
unsigned int)soa2.
size());
350 pg_4 = svwhilelt_b64(j_4, (
unsigned int)soa2.
size());
353 reinterpret_cast<const int64_t *
>(ownedStatePtr2), x1, y1, z1, x2ptr, y2ptr, z2ptr,
354 fx2ptr, fy2ptr, fz2ptr, typeID1ptr, typeID2ptr, fxacc, fyacc, fzacc, virialSumX,
355 virialSumY, virialSumZ, potentialEnergySum, pg_1, svundef_u64(), pg_2, svundef_u64(),
356 pg_3, svundef_u64(), pg_4, svundef_u64());
359 fx1ptr[i] += svaddv_f64(svptrue_b64(), fxacc);
360 fy1ptr[i] += svaddv_f64(svptrue_b64(), fyacc);
361 fz1ptr[i] += svaddv_f64(svptrue_b64(), fzacc);
364 if constexpr (calculateGlobals) {
367 _aosThreadData[threadnum].potentialEnergySum += svaddv_f64(svptrue_b64(), potentialEnergySum);
368 _aosThreadData[threadnum].virialSum[0] += svaddv_f64(svptrue_b64(), virialSumX);
369 _aosThreadData[threadnum].virialSum[1] += svaddv_f64(svptrue_b64(), virialSumY);
370 _aosThreadData[threadnum].virialSum[2] += svaddv_f64(svptrue_b64(), virialSumZ);
374#ifdef __ARM_FEATURE_SVE
375 template <
bool indexed>
376 inline svbool_t distCalc(
const size_t j,
const svuint64_t &index,
const svfloat64_t &x1,
const svfloat64_t &y1,
377 const svfloat64_t &z1,
const svbool_t &pg,
const int64_t *
const __restrict ownedStatePtr2,
378 const double *
const __restrict x2ptr,
const double *
const __restrict y2ptr,
379 const double *
const __restrict z2ptr, svfloat64_t &drx, svfloat64_t &dry, svfloat64_t &drz,
380 svfloat64_t &dr2, svint64_t &ownedStateJ) {
381 const svfloat64_t x2 = (indexed) ? svld1_gather_index(pg, x2ptr, index) : svld1(pg, &x2ptr[j]);
382 const svfloat64_t y2 = (indexed) ? svld1_gather_index(pg, y2ptr, index) : svld1(pg, &y2ptr[j]);
383 const svfloat64_t z2 = (indexed) ? svld1_gather_index(pg, z2ptr, index) : svld1(pg, &z2ptr[j]);
386 drx = svsub_m(pg, x1, x2);
387 dry = svsub_m(pg, y1, y2);
388 drz = svsub_m(pg, z1, z2);
390 const svfloat64_t dr2_1 = svmul_x(pg, drx, drx);
391 const svfloat64_t dr2_2 = svmla_x(pg, dr2_1, dry, dry);
392 dr2 = svmla_x(pg, dr2_2, drz, drz);
394 const svbool_t cutoffMask = svcmple(pg, dr2, _cutoffSquared);
396 ownedStateJ = (indexed) ? svld1_gather_index(pg, ownedStatePtr2, index) : svld1(pg, &ownedStatePtr2[j]);
398 return svand_z(pg, cutoffMask, dummyMask);
401 template <
bool indexed>
402 inline void lennardJones(
const svuint64_t &index,
const size_t *
const typeID1ptr,
const size_t *
const typeID2ptr,
403 const svbool_t &pgC,
const svfloat64_t &dr2, svfloat64_t &epsilon24s, svfloat64_t &shift6s,
404 svfloat64_t &lj6, svfloat64_t &fac) {
405 const svuint64_t typeIds =
406 useMixing ? svmul_m(pgC, (indexed) ? svld1_gather_index(pgC, typeID2ptr, index) : svld1_u64(pgC, typeID2ptr), 3)
408 const auto mixingDataPtr = useMixing ? _PPLibrary->
getLJMixingDataPtr(*typeID1ptr, 0) :
nullptr;
410 const svfloat64_t sigmaSquareds =
411 useMixing ? svld1_gather_index(pgC, mixingDataPtr + 1, typeIds) : svdup_f64(_sigmaSquared);
412 epsilon24s = useMixing ? svld1_gather_index(pgC, mixingDataPtr, typeIds) : svdup_f64(_epsilon24);
413 shift6s = (useMixing && applyShift) ? svld1_gather_index(pgC, mixingDataPtr + 2, typeIds) : svdup_f64(_shift6);
415 svfloat64_t invdr2 = svrecpe(dr2);
416 invdr2 = svmul_x(pgC, invdr2, svrecps(dr2, invdr2));
417 invdr2 = svmul_x(pgC, invdr2, svrecps(dr2, invdr2));
418 invdr2 = svmul_x(pgC, invdr2, svrecps(dr2, invdr2));
419 invdr2 = svmul_x(pgC, invdr2, svrecps(dr2, invdr2));
420 const svfloat64_t lj2 = svmul_x(pgC, sigmaSquareds, invdr2);
422 const svfloat64_t lj4 = svmul_x(pgC, lj2, lj2);
423 lj6 = svmul_x(pgC, lj2, lj4);
424 const svfloat64_t lj12 = svmul_x(pgC, lj6, lj6);
428 const svfloat64_t lj12m6alj12 = svnmls_x(pgC, lj6, lj12, 2);
430 const svfloat64_t lj12m6alj12e = svmul_x(pgC, lj12m6alj12, epsilon24s);
431 fac = svmul_x(pgC, lj12m6alj12e, invdr2);
434 template <
bool newton3,
bool indexed>
435 inline void applyForces(
const size_t j,
const svuint64_t &index,
const bool ownedStateIisOwned,
436 double *
const __restrict fx2ptr,
double *
const __restrict fy2ptr,
437 double *
const __restrict fz2ptr, svfloat64_t &fxacc, svfloat64_t &fyacc, svfloat64_t &fzacc,
438 svfloat64_t &virialSumX, svfloat64_t &virialSumY, svfloat64_t &virialSumZ,
439 svfloat64_t &potentialEnergySum,
const svfloat64_t &drx,
const svfloat64_t &dry,
440 const svfloat64_t &drz,
const double *
const __restrict x2ptr,
441 const double *
const __restrict y2ptr,
const double *
const __restrict z2ptr,
442 const svint64_t &ownedStateJ,
const svbool_t &pgC,
const svfloat64_t &epsilon24s,
443 const svfloat64_t &shift6s,
const svfloat64_t &lj6,
const svfloat64_t &fac) {
444 const svfloat64_t fx = svmul_x(pgC, drx, fac);
445 const svfloat64_t fy = svmul_x(pgC, dry, fac);
446 const svfloat64_t fz = svmul_x(pgC, drz, fac);
448 fxacc = svadd_f64_m(pgC, fxacc, fx);
449 fyacc = svadd_f64_m(pgC, fyacc, fy);
450 fzacc = svadd_f64_m(pgC, fzacc, fz);
453 const svfloat64_t fx2 = (indexed) ? svld1_gather_index(pgC, fx2ptr, index) : svld1_f64(pgC, &fx2ptr[j]);
454 const svfloat64_t fy2 = (indexed) ? svld1_gather_index(pgC, fy2ptr, index) : svld1_f64(pgC, &fy2ptr[j]);
455 const svfloat64_t fz2 = (indexed) ? svld1_gather_index(pgC, fz2ptr, index) : svld1_f64(pgC, &fz2ptr[j]);
457 const svfloat64_t fx2new = svsub_x(pgC, fx2, fx);
458 const svfloat64_t fy2new = svsub_x(pgC, fy2, fy);
459 const svfloat64_t fz2new = svsub_x(pgC, fz2, fz);
461 if constexpr (indexed) {
462 svst1_scatter_index(pgC, &fx2ptr[0], index, fx2new);
463 svst1_scatter_index(pgC, &fy2ptr[0], index, fy2new);
464 svst1_scatter_index(pgC, &fz2ptr[0], index, fz2new);
466 svst1(pgC, &fx2ptr[j], fx2new);
467 svst1(pgC, &fy2ptr[j], fy2new);
468 svst1(pgC, &fz2ptr[j], fz2new);
472 if (calculateGlobals) {
474 const svfloat64_t virialX = svmul_x(pgC, fx, drx);
475 const svfloat64_t virialY = svmul_x(pgC, fy, dry);
476 const svfloat64_t virialZ = svmul_x(pgC, fz, drz);
479 const svfloat64_t lj12m6 = svnmls_x(pgC, lj6, lj6, lj6);
480 const svfloat64_t potentialEnergy6 = svmad_x(pgC, epsilon24s, lj12m6, shift6s);
481 svfloat64_t energyFactor = svdup_f64(ownedStateIisOwned ? 1.0 : 0.0);
483 if constexpr (newton3) {
485 energyFactor = svadd_m(ownedMaskJ, energyFactor, 1.0);
487 potentialEnergySum = svmla_m(pgC, potentialEnergySum, energyFactor, potentialEnergy6);
488 virialSumX = svmla_m(pgC, virialSumX, energyFactor, virialX);
489 virialSumY = svmla_m(pgC, virialSumY, energyFactor, virialY);
490 virialSumZ = svmla_m(pgC, virialSumZ, energyFactor, virialZ);
494 template <
bool newton3,
bool indexed>
496 __attribute__((always_inline))
inline void SoAKernel(
497 const size_t j,
const bool ownedStateIisOwned,
const int64_t *
const __restrict ownedStatePtr2,
498 const svfloat64_t &x1,
const svfloat64_t &y1,
const svfloat64_t &z1,
const double *
const __restrict x2ptr,
499 const double *
const __restrict y2ptr,
const double *
const __restrict z2ptr,
double *
const __restrict fx2ptr,
500 double *
const __restrict fy2ptr,
double *
const __restrict fz2ptr,
const size_t *
const typeID1ptr,
501 const size_t *
const typeID2ptr, svfloat64_t &fxacc, svfloat64_t &fyacc, svfloat64_t &fzacc,
502 svfloat64_t &virialSumX, svfloat64_t &virialSumY, svfloat64_t &virialSumZ, svfloat64_t &potentialEnergySum,
504 const svbool_t &pg_1,
const svuint64_t &index_1,
const svbool_t &pg_2,
const svuint64_t &index_2,
505 const svbool_t &pg_3,
const svuint64_t &index_3,
const svbool_t &pg_4,
const svuint64_t &index_4
512 svint64_t ownedStateJ_1;
513 const svbool_t pgC_1 = distCalc<indexed>(j, index_1, x1, y1, z1, pg_1, ownedStatePtr2, x2ptr, y2ptr, z2ptr, drx_1,
514 dry_1, drz_1, dr2_1, ownedStateJ_1);
520 svint64_t ownedStateJ_2;
521 const svbool_t pgC_2 = distCalc<indexed>(j + svlen(x1), index_2, x1, y1, z1, pg_2, ownedStatePtr2, x2ptr, y2ptr,
522 z2ptr, drx_2, dry_2, drz_2, dr2_2, ownedStateJ_2);
528 svint64_t ownedStateJ_3;
529 const svbool_t pgC_3 = distCalc<indexed>(j + svlen(x1) * 2, index_3, x1, y1, z1, pg_3, ownedStatePtr2, x2ptr, y2ptr,
530 z2ptr, drx_3, dry_3, drz_3, dr2_3, ownedStateJ_3);
536 svint64_t ownedStateJ_4;
537 const svbool_t pgC_4 = distCalc<indexed>(j + svlen(x1) * 3, index_4, x1, y1, z1, pg_4, ownedStatePtr2, x2ptr, y2ptr,
538 z2ptr, drx_4, dry_4, drz_4, dr2_4, ownedStateJ_4);
540 const bool continue_1 = svptest_any(svptrue_b64(), pgC_1);
541 const bool continue_2 = svptest_any(svptrue_b64(), pgC_2);
542 const bool continue_3 = svptest_any(svptrue_b64(), pgC_3);
543 const bool continue_4 = svptest_any(svptrue_b64(), pgC_4);
545 svfloat64_t epsilon24s_1;
546 svfloat64_t shift6s_1;
550 lennardJones<indexed>(index_1, typeID1ptr, typeID2ptr, pgC_1, dr2_1, epsilon24s_1, shift6s_1, lj6_1, fac_1);
552 svfloat64_t epsilon24s_2;
553 svfloat64_t shift6s_2;
557 lennardJones<indexed>(index_2, typeID1ptr, typeID2ptr, pgC_2, dr2_2, epsilon24s_2, shift6s_2, lj6_2, fac_2);
559 svfloat64_t epsilon24s_3;
560 svfloat64_t shift6s_3;
564 lennardJones<indexed>(index_3, typeID1ptr, typeID2ptr, pgC_3, dr2_3, epsilon24s_3, shift6s_3, lj6_3, fac_3);
566 svfloat64_t epsilon24s_4;
567 svfloat64_t shift6s_4;
571 lennardJones<indexed>(index_4, typeID1ptr, typeID2ptr, pgC_4, dr2_4, epsilon24s_4, shift6s_4, lj6_4, fac_4);
574 applyForces<newton3, indexed>(j, index_1, ownedStateIisOwned, fx2ptr, fy2ptr, fz2ptr, fxacc, fyacc, fzacc,
575 virialSumX, virialSumY, virialSumZ, potentialEnergySum, drx_1, dry_1, drz_1, x2ptr,
576 y2ptr, z2ptr, ownedStateJ_1, pgC_1, epsilon24s_1, shift6s_1, lj6_1, fac_1);
578 applyForces<newton3, indexed>(j + svlen(x1), index_2, ownedStateIisOwned, fx2ptr, fy2ptr, fz2ptr, fxacc, fyacc,
579 fzacc, virialSumX, virialSumY, virialSumZ, potentialEnergySum, drx_2, dry_2, drz_2,
580 x2ptr, y2ptr, z2ptr, ownedStateJ_2, pgC_2, epsilon24s_2, shift6s_2, lj6_2, fac_2);
583 applyForces<newton3, indexed>(j + svlen(x1) * 2, index_3, ownedStateIisOwned, fx2ptr, fy2ptr, fz2ptr, fxacc,
584 fyacc, fzacc, virialSumX, virialSumY, virialSumZ, potentialEnergySum, drx_3, dry_3,
585 drz_3, x2ptr, y2ptr, z2ptr, ownedStateJ_3, pgC_3, epsilon24s_3, shift6s_3, lj6_3,
589 applyForces<newton3, indexed>(j + svlen(x1) * 3, index_4, ownedStateIisOwned, fx2ptr, fy2ptr, fz2ptr, fxacc,
590 fyacc, fzacc, virialSumX, virialSumY, virialSumZ, potentialEnergySum, drx_4, dry_4,
591 drz_4, x2ptr, y2ptr, z2ptr, ownedStateJ_4, pgC_4, epsilon24s_4, shift6s_4, lj6_4,
606 bool newton3)
final {
607 if (soa.
size() == 0 or neighborList.empty())
return;
609 SoAFunctorVerletImpl<true>(soa, indexFirst, neighborList);
611 SoAFunctorVerletImpl<false>(soa, indexFirst, neighborList);
616 template <
bool newton3>
619#ifdef __ARM_FEATURE_SVE
620 const auto *
const __restrict ownedStatePtr = soa.template begin<Particle_T::AttributeNames::ownershipState>();
624 const auto *
const __restrict xptr = soa.template begin<Particle_T::AttributeNames::posX>();
625 const auto *
const __restrict yptr = soa.template begin<Particle_T::AttributeNames::posY>();
626 const auto *
const __restrict zptr = soa.template begin<Particle_T::AttributeNames::posZ>();
628 auto *
const __restrict fxptr = soa.template begin<Particle_T::AttributeNames::forceX>();
629 auto *
const __restrict fyptr = soa.template begin<Particle_T::AttributeNames::forceY>();
630 auto *
const __restrict fzptr = soa.template begin<Particle_T::AttributeNames::forceZ>();
632 const auto *
const __restrict typeIDptr = soa.template begin<Particle_T::AttributeNames::typeId>();
635 svfloat64_t virialSumX = svdup_f64(0.0);
636 svfloat64_t virialSumY = svdup_f64(0.0);
637 svfloat64_t virialSumZ = svdup_f64(0.0);
638 svfloat64_t potentialEnergySum = svdup_f64(0.0);
640 svfloat64_t fxacc = svdup_f64(0.0);
641 svfloat64_t fyacc = svdup_f64(0.0);
642 svfloat64_t fzacc = svdup_f64(0.0);
645 const auto x1 = svdup_f64(xptr[indexFirst]);
646 const auto y1 = svdup_f64(yptr[indexFirst]);
647 const auto z1 = svdup_f64(zptr[indexFirst]);
650 const auto *
const ownedStatePtr2 =
reinterpret_cast<const int64_t *
>(ownedStatePtr);
652 for (; j < neighborList.size(); j += svlen(x1)) {
653 pg_1 = svwhilelt_b64(j, neighborList.size());
654 const svuint64_t index_1 = svld1(pg_1, &neighborList[j]);
660 svint64_t ownedStateJ_1;
661 const svbool_t pgC_1 = distCalc<true>(0, index_1, x1, y1, z1, pg_1, ownedStatePtr2, xptr, yptr, zptr, drx_1,
662 dry_1, drz_1, dr2_1, ownedStateJ_1);
664 const bool continue_1 = svptest_any(svptrue_b64(), pgC_1);
665 svfloat64_t epsilon24s_1;
666 svfloat64_t shift6s_1;
670 lennardJones<true>(index_1, typeIDptr, typeIDptr, pgC_1, dr2_1, epsilon24s_1, shift6s_1, lj6_1, fac_1);
674 fyptr, fzptr, fxacc, fyacc, fzacc, virialSumX, virialSumY, virialSumZ,
675 potentialEnergySum, drx_1, dry_1, drz_1, xptr, yptr, zptr, ownedStateJ_1, pgC_1,
676 epsilon24s_1, shift6s_1, lj6_1, fac_1);
679 fxptr[indexFirst] += svaddv_f64(svptrue_b64(), fxacc);
680 fyptr[indexFirst] += svaddv_f64(svptrue_b64(), fyacc);
681 fzptr[indexFirst] += svaddv_f64(svptrue_b64(), fzacc);
683 if constexpr (calculateGlobals) {
686 _aosThreadData[threadnum].potentialEnergySum += svaddv_f64(svptrue_b64(), potentialEnergySum);
687 _aosThreadData[threadnum].virialSum[0] += svaddv_f64(svptrue_b64(), virialSumX);
688 _aosThreadData[threadnum].virialSum[1] += svaddv_f64(svptrue_b64(), virialSumY);
689 _aosThreadData[threadnum].virialSum[2] += svaddv_f64(svptrue_b64(), virialSumZ);
699 return std::array<typename Particle_T::AttributeNames, 9>{Particle_T::AttributeNames::id,
700 Particle_T::AttributeNames::posX,
701 Particle_T::AttributeNames::posY,
702 Particle_T::AttributeNames::posZ,
703 Particle_T::AttributeNames::forceX,
704 Particle_T::AttributeNames::forceY,
705 Particle_T::AttributeNames::forceZ,
706 Particle_T::AttributeNames::typeId,
707 Particle_T::AttributeNames::ownershipState};
714 return std::array<typename Particle_T::AttributeNames, 6>{
715 Particle_T::AttributeNames::id, Particle_T::AttributeNames::posX,
716 Particle_T::AttributeNames::posY, Particle_T::AttributeNames::posZ,
717 Particle_T::AttributeNames::typeId, Particle_T::AttributeNames::ownershipState};
724 return std::array<typename Particle_T::AttributeNames, 3>{
725 Particle_T::AttributeNames::forceX, Particle_T::AttributeNames::forceY, Particle_T::AttributeNames::forceZ};
749 return newton3 ? 18ul : 15ul;
757 _potentialEnergySum = 0.;
758 _virialSum = {0., 0., 0.};
759 _postProcessed =
false;
760 for (
size_t i = 0; i < _aosThreadData.size(); ++i) {
761 _aosThreadData[i].setZero();
770 using namespace autopas::utils::ArrayMath::literals;
772 if (_postProcessed) {
774 "Already postprocessed, endTraversal(bool newton3) was called twice without calling initTraversal().");
776 if (calculateGlobals) {
777 for (
size_t i = 0; i < _aosThreadData.size(); ++i) {
778 _potentialEnergySum += _aosThreadData[i].potentialEnergySum;
779 _virialSum += _aosThreadData[i].virialSum;
783 _potentialEnergySum *= 0.5;
787 _potentialEnergySum /= 6.;
788 _postProcessed =
true;
790 AutoPasLog(DEBUG,
"Final potential energy {}", _potentialEnergySum);
791 AutoPasLog(DEBUG,
"Final virial {}", _virialSum[0] + _virialSum[1] + _virialSum[2]);
800 if (not calculateGlobals) {
802 "Trying to get potential energy even though calculateGlobals is false. If you want this functor to calculate "
804 "values, please specify calculateGlobals to be true.");
806 if (not _postProcessed) {
808 "Cannot get potential energy, because endTraversal was not called.");
810 return _potentialEnergySum;
818 if (not calculateGlobals) {
820 "Trying to get virial even though calculateGlobals is false. If you want this functor to calculate global "
821 "values, please specify calculateGlobals to be true.");
823 if (not _postProcessed) {
825 "Cannot get virial, because endTraversal was not called.");
827 return _virialSum[0] + _virialSum[1] + _virialSum[2];
839#ifdef __ARM_FEATURE_SVE
840 _epsilon24 = epsilon24;
841 _sigmaSquared = sigmaSquared;
842 if constexpr (applyShift) {
849 _epsilon24AoS = epsilon24;
850 _sigmaSquaredAoS = sigmaSquared;
851 if constexpr (applyShift) {
862 class AoSThreadData {
864 AoSThreadData() : virialSum{0., 0., 0.}, potentialEnergySum{0.}, __remainingTo64{} {}
866 virialSum = {0., 0., 0.};
867 potentialEnergySum = 0.;
871 std::array<double, 3> virialSum;
872 double potentialEnergySum;
876 double __remainingTo64[(64 - 4 *
sizeof(double)) /
sizeof(double)];
879 static_assert(
sizeof(AoSThreadData) % 64 == 0,
"AoSThreadData has wrong size");
881#ifdef __ARM_FEATURE_SVE
882 const double _cutoffSquared{};
884 double _epsilon24{0.};
885 double _sigmaSquared{0.};
888 const double _cutoffSquaredAoS;
889 double _epsilon24AoS{0.}, _sigmaSquaredAoS{0.}, _shift6AoS{0.};
894 double _potentialEnergySum{0.};
897 std::array<double, 3> _virialSum{0., 0., 0.};
900 std::vector<AoSThreadData> _aosThreadData{};
903 bool _postProcessed{
false};
#define AutoPasLog(lvl, fmt,...)
Macro for logging providing common meta information without filename.
Definition: Logger.h:24
This class stores the (physical) properties of molecule types, and, in the case of multi-site molecul...
Definition: ParticlePropertiesLibrary.h:28
floatType getMixingShift6(intType i, intType j) const
Returns precomputed mixed shift * 6 for one pair of site types.
Definition: ParticlePropertiesLibrary.h:262
floatType getMixingSigmaSquared(intType i, intType j) const
Returns precomputed mixed squared sigma for one pair of site types.
Definition: ParticlePropertiesLibrary.h:252
const double * getLJMixingDataPtr(intType i, intType j)
Get a pointer to Mixing Data for one pair of LJ site types.
Definition: ParticlePropertiesLibrary.h:242
floatType getMixing24Epsilon(intType i, intType j) const
Returns the precomputed mixed epsilon * 24.
Definition: ParticlePropertiesLibrary.h:224
static double calcShift6(double epsilon24, double sigmaSquared, double cutoffSquared)
Calculate the shift multiplied 6 of the lennard jones potential from given cutoff,...
Definition: ParticlePropertiesLibrary.h:575
AlignedAllocator class.
Definition: AlignedAllocator.h:29
PairwiseFunctor class.
Definition: PairwiseFunctor.h:31
View on a fixed part of a SoA between a start index and an end index.
Definition: SoAView.h:23
size_t size() const
Returns the number of particles in the view.
Definition: SoAView.h:83
Default exception class for autopas exceptions.
Definition: ExceptionHandler.h:115
static void exception(const Exception e)
Handle an exception derived by std::exception.
Definition: ExceptionHandler.h:63
A functor to handle lennard-jones interactions between two particles (molecules).
Definition: LJFunctorSVE.h:48
void initTraversal() final
Reset the global values.
Definition: LJFunctorSVE.h:756
double getVirial()
Get the virial.
Definition: LJFunctorSVE.h:817
bool isRelevantForTuning() final
Specifies whether the functor should be considered for the auto-tuning process.
Definition: LJFunctorSVE.h:117
void SoAFunctorPair(autopas::SoAView< SoAArraysType > soa1, autopas::SoAView< SoAArraysType > soa2, const bool newton3) final
PairwiseFunctor for structure of arrays (SoA)
Definition: LJFunctorSVE.h:200
static constexpr bool getMixing()
Definition: LJFunctorSVE.h:732
LJFunctorSVE()=delete
Deleted default constructor.
static unsigned long getNumFlopsPerKernelCall(size_t molAType, size_t molBType, bool newton3)
Get the number of flops used per kernel call for a given particle pair.
Definition: LJFunctorSVE.h:745
LJFunctorSVE(double cutoff)
Constructor for Functor with mixing disabled.
Definition: LJFunctorSVE.h:95
static constexpr auto getNeededAttr()
Get attributes needed for computation.
Definition: LJFunctorSVE.h:698
void endTraversal(bool newton3) final
Accumulates global values, e.g.
Definition: LJFunctorSVE.h:769
static constexpr auto getNeededAttr(std::false_type)
Get attributes needed for computation without N3 optimization.
Definition: LJFunctorSVE.h:713
void setParticleProperties(double epsilon24, double sigmaSquared)
Sets the particle properties constants for this functor.
Definition: LJFunctorSVE.h:838
std::string getName() final
Returns name of functor.
Definition: LJFunctorSVE.h:115
double getPotentialEnergy()
Get the potential Energy.
Definition: LJFunctorSVE.h:799
LJFunctorSVE(double cutoff, ParticlePropertiesLibrary< double, size_t > &particlePropertiesLibrary)
Constructor for Functor with mixing active.
Definition: LJFunctorSVE.h:107
bool allowsNonNewton3() final
Specifies whether the functor is capable of non-Newton3-like functors.
Definition: LJFunctorSVE.h:123
void SoAFunctorSingle(autopas::SoAView< SoAArraysType > soa, bool newton3) final
PairwiseFunctor for structure of arrays (SoA)
Definition: LJFunctorSVE.h:187
bool allowsNewton3() final
Specifies whether the functor is capable of Newton3-like functors.
Definition: LJFunctorSVE.h:119
static constexpr auto getComputedAttr()
Get attributes computed by this functor.
Definition: LJFunctorSVE.h:723
void AoSFunctor(Particle_T &i, Particle_T &j, bool newton3) final
PairwiseFunctor for arrays of structures (AoS).
Definition: LJFunctorSVE.h:127
void SoAFunctorVerlet(autopas::SoAView< SoAArraysType > soa, const size_t indexFirst, const std::vector< size_t, autopas::AlignedAllocator< size_t > > &neighborList, bool newton3) final
PairwiseFunctor for structure of arrays (SoA) for neighbor lists.
Definition: LJFunctorSVE.h:604
constexpr T dot(const std::array< T, SIZE > &a, const std::array< T, SIZE > &b)
Generates the dot product of two arrays.
Definition: ArrayMath.h:233
int autopas_get_max_threads()
Dummy for omp_get_max_threads() when no OpenMP is available.
Definition: WrapOpenMP.h:144
@ dummy
Dummy or deleted state, a particle with this state is not an actual particle!
@ owned
Owned state, a particle with this state is an actual particle and owned by the current AutoPas object...
FunctorN3Modes
Newton 3 modes for the Functor.
Definition: Functor.h:22
int autopas_get_thread_num()
Dummy for omp_set_lock() when no OpenMP is available.
Definition: WrapOpenMP.h:132