AutoPas  3.0.0
Loading...
Searching...
No Matches
LJFunctorSVE.h
Go to the documentation of this file.
1
7#pragma once
8
9#ifndef __ARM_FEATURE_SVE
10#pragma message "Requested to compile LJFunctorSVE but SVE is not available!"
11#else
12#include <arm_sve.h>
13#endif
14
15#include <array>
16
24#include "autopas/utils/inBox.h"
25
26namespace mdLib {
27
46template <class Particle_T, bool applyShift = false, bool useMixing = false,
47 autopas::FunctorN3Modes useNewton3 = autopas::FunctorN3Modes::Both, bool calculateGlobals = false,
48 bool countFLOPs = false, bool relevantForTuning = true>
50 : public autopas::PairwiseFunctor<Particle_T, LJFunctorSVE<Particle_T, applyShift, useMixing, useNewton3,
51 calculateGlobals, countFLOPs, relevantForTuning>> {
52 using SoAArraysType = typename Particle_T::SoAArraysType;
53
54 public:
58 LJFunctorSVE() = delete;
59
60 private:
66 explicit LJFunctorSVE(double cutoff, void * /*dummy*/)
67#ifdef __ARM_FEATURE_SVE
68 : autopas::PairwiseFunctor<Particle_T, LJFunctorSVE<Particle_T, applyShift, useMixing, useNewton3,
69 calculateGlobals, countFLOPs, relevantForTuning>>(cutoff),
70 _cutoffSquared{cutoff * cutoff},
71 _cutoffSquaredAoS(cutoff * cutoff),
72 _potentialEnergySum{0.},
73 _virialSum{0., 0., 0.},
74 _aosThreadData(),
75 _postProcessed{false} {
76 if (calculateGlobals) {
77 _aosThreadData.resize(autopas::autopas_get_max_threads());
78 }
79 if constexpr (countFLOPs) {
80 AutoPasLog(DEBUG, "Using LJFunctorSVE with countFLOPs but FLOP counting is not implemented.");
81 }
82 }
83#else
84 : autopas::PairwiseFunctor<Particle_T, LJFunctorSVE<Particle_T, applyShift, useMixing, useNewton3,
85 calculateGlobals, countFLOPs, relevantForTuning>>(cutoff) {
86 autopas::utils::ExceptionHandler::exception("AutoPas was compiled without SVE support!");
87 }
88#endif
89 public:
98 explicit LJFunctorSVE(double cutoff) : LJFunctorSVE(cutoff, nullptr) {
99 static_assert(not useMixing,
100 "Mixing without a ParticlePropertiesLibrary is not possible! Use a different constructor or set "
101 "mixing to false.");
102 }
103
110 explicit LJFunctorSVE(double cutoff, ParticlePropertiesLibrary<double, size_t> &particlePropertiesLibrary)
111 : LJFunctorSVE(cutoff, nullptr) {
112 static_assert(useMixing,
113 "Not using Mixing but using a ParticlePropertiesLibrary is not allowed! Use a different constructor "
114 "or set mixing to true.");
115 _PPLibrary = &particlePropertiesLibrary;
116 }
117
118 std::string getName() final { return "LJFunctorSVE"; }
119
120 bool isRelevantForTuning() final { return relevantForTuning; }
121
122 bool allowsNewton3() final {
123 return useNewton3 == autopas::FunctorN3Modes::Newton3Only or useNewton3 == autopas::FunctorN3Modes::Both;
124 }
125
126 bool allowsNonNewton3() final {
127 return useNewton3 == autopas::FunctorN3Modes::Newton3Off or useNewton3 == autopas::FunctorN3Modes::Both;
128 }
129
130 void AoSFunctor(Particle_T &i, Particle_T &j, bool newton3) final {
131 using namespace autopas::utils::ArrayMath::literals;
132
133 if (i.isDummy() or j.isDummy()) {
134 return;
135 }
136 auto sigmaSquared = _sigmaSquaredAoS;
137 auto epsilon24 = _epsilon24AoS;
138 auto shift6 = _shift6AoS;
139 if constexpr (useMixing) {
140 sigmaSquared = _PPLibrary->getMixingSigmaSquared(i.getTypeId(), j.getTypeId());
141 epsilon24 = _PPLibrary->getMixing24Epsilon(i.getTypeId(), j.getTypeId());
142 if constexpr (applyShift) {
143 shift6 = _PPLibrary->getMixingShift6(i.getTypeId(), j.getTypeId());
144 }
145 }
146 auto dr = i.getR() - j.getR();
147 double dr2 = autopas::utils::ArrayMath::dot(dr, dr);
148
149 if (dr2 > _cutoffSquaredAoS) {
150 return;
151 }
152
153 double invdr2 = 1. / dr2;
154 double lj6 = sigmaSquared * invdr2;
155 lj6 = lj6 * lj6 * lj6;
156 double lj12 = lj6 * lj6;
157 double lj12m6 = lj12 - lj6;
158 double fac = epsilon24 * (lj12 + lj12m6) * invdr2;
159 auto f = dr * fac;
160 i.addF(f);
161 if (newton3) {
162 // only if we use newton 3 here, we want to
163 j.subF(f);
164 }
165 if (calculateGlobals) {
166 // We always add the full contribution for each owned particle and divide the sums by 2 in endTraversal().
167 // Potential energy has an additional factor of 6, which is also handled in endTraversal().
168
169 auto virial = dr * f;
170 double potentialEnergy6 = epsilon24 * lj12m6 + shift6;
171
172 const int threadnum = autopas::autopas_get_thread_num();
173 if (i.isOwned()) {
174 _aosThreadData[threadnum].potentialEnergySum += potentialEnergy6;
175 _aosThreadData[threadnum].virialSum += virial;
176 }
177 // for non-newton3 the second particle will be considered in a separate calculation
178 if (newton3 and j.isOwned()) {
179 _aosThreadData[threadnum].potentialEnergySum += potentialEnergy6;
180 _aosThreadData[threadnum].virialSum += virial;
181 }
182 }
183 }
184
190 void SoAFunctorSingle(autopas::SoAView<SoAArraysType> soa, bool newton3) final {
191 if (newton3) {
192 SoAFunctorSingleImpl<true>(soa);
193 } else {
194 SoAFunctorSingleImpl<false>(soa);
195 }
196 }
197
198 // clang-format off
202 // clang-format on
204 const bool newton3) final {
205 if (newton3) {
206 SoAFunctorPairImpl<true>(soa1, soa2);
207 } else {
208 SoAFunctorPairImpl<false>(soa1, soa2);
209 }
210 }
211
212 private:
218 template <bool newton3>
219 void SoAFunctorSingleImpl(autopas::SoAView<SoAArraysType> soa) {
220#ifdef __ARM_FEATURE_SVE
221 if (soa.size() == 0) return;
222
223 const auto *const __restrict xptr = soa.template begin<Particle_T::AttributeNames::posX>();
224 const auto *const __restrict yptr = soa.template begin<Particle_T::AttributeNames::posY>();
225 const auto *const __restrict zptr = soa.template begin<Particle_T::AttributeNames::posZ>();
226
227 const auto *const __restrict ownedStatePtr = soa.template begin<Particle_T::AttributeNames::ownershipState>();
228
229 auto *const __restrict fxptr = soa.template begin<Particle_T::AttributeNames::forceX>();
230 auto *const __restrict fyptr = soa.template begin<Particle_T::AttributeNames::forceY>();
231 auto *const __restrict fzptr = soa.template begin<Particle_T::AttributeNames::forceZ>();
232
233 const auto *const __restrict typeIDptr = soa.template begin<Particle_T::AttributeNames::typeId>();
234
235 svfloat64_t virialSumX = svdup_f64(0.0);
236 svfloat64_t virialSumY = svdup_f64(0.0);
237 svfloat64_t virialSumZ = svdup_f64(0.0);
238 svfloat64_t potentialEnergySum = svdup_f64(0.0);
239
240 const auto vecLength = (size_t)svlen_f64(potentialEnergySum);
241
242 // reverse outer loop s.th. inner loop always beginns at aligned array start
243 // typecast to detect underflow
244 for (size_t i = soa.size() - 1; (long)i >= 0; --i) {
245 if (ownedStatePtr[i] == autopas::OwnershipState::dummy) {
246 // If the i-th particle is a dummy, skip this loop iteration.
247 continue;
248 }
249
250 static_assert(std::is_same_v<std::underlying_type_t<autopas::OwnershipState>, int64_t>,
251 "OwnershipStates underlying type should be int64_t!");
252
253 svfloat64_t fxacc = svdup_f64(0.0);
254 svfloat64_t fyacc = svdup_f64(0.0);
255 svfloat64_t fzacc = svdup_f64(0.0);
256
257 const svfloat64_t x1 = svdup_f64(xptr[i]);
258 const svfloat64_t y1 = svdup_f64(yptr[i]);
259 const svfloat64_t z1 = svdup_f64(zptr[i]);
260
261 svbool_t pg_1, pg_2, pg_3, pg_4;
262 size_t j = 0;
263 for (; j < i; j += vecLength * 4) {
264 const size_t j_2 = j + vecLength;
265 const size_t j_3 = j_2 + vecLength;
266 const size_t j_4 = j_3 + vecLength;
267 pg_1 = svwhilelt_b64(j, i);
268 pg_2 = svwhilelt_b64(j_2, i);
269 pg_3 = svwhilelt_b64(j_3, i);
270 pg_4 = svwhilelt_b64(j_4, i);
271
272 SoAKernel<true, false>(j, ownedStatePtr[i] == autopas::OwnershipState::owned,
273 reinterpret_cast<const int64_t *>(ownedStatePtr), x1, y1, z1, xptr, yptr, zptr, fxptr,
274 fyptr, fzptr, &typeIDptr[i], typeIDptr, fxacc, fyacc, fzacc, virialSumX, virialSumY,
275 virialSumZ, potentialEnergySum, pg_1, svundef_u64(), pg_2, svundef_u64(), pg_3,
276 svundef_u64(), pg_4, svundef_u64());
277 }
278
279 fxptr[i] += svaddv(svptrue_b64(), fxacc);
280 fyptr[i] += svaddv(svptrue_b64(), fyacc);
281 fzptr[i] += svaddv(svptrue_b64(), fzacc);
282 }
283
284 if constexpr (calculateGlobals) {
285 const int threadnum = autopas::autopas_get_thread_num();
286
287 _aosThreadData[threadnum].potentialEnergySum += svaddv_f64(svptrue_b64(), potentialEnergySum);
288 _aosThreadData[threadnum].virialSum[0] += svaddv_f64(svptrue_b64(), virialSumX);
289 _aosThreadData[threadnum].virialSum[1] += svaddv_f64(svptrue_b64(), virialSumY);
290 _aosThreadData[threadnum].virialSum[2] += svaddv_f64(svptrue_b64(), virialSumZ);
291 }
292#endif
293 }
294
295 template <bool newton3>
296 void SoAFunctorPairImpl(autopas::SoAView<SoAArraysType> soa1, autopas::SoAView<SoAArraysType> soa2) {
297#ifdef __ARM_FEATURE_SVE
298 if (soa1.size() == 0 || soa2.size() == 0) return;
299
300 const auto *const __restrict x1ptr = soa1.template begin<Particle_T::AttributeNames::posX>();
301 const auto *const __restrict y1ptr = soa1.template begin<Particle_T::AttributeNames::posY>();
302 const auto *const __restrict z1ptr = soa1.template begin<Particle_T::AttributeNames::posZ>();
303 const auto *const __restrict x2ptr = soa2.template begin<Particle_T::AttributeNames::posX>();
304 const auto *const __restrict y2ptr = soa2.template begin<Particle_T::AttributeNames::posY>();
305 const auto *const __restrict z2ptr = soa2.template begin<Particle_T::AttributeNames::posZ>();
306
307 const auto *const __restrict ownedStatePtr1 = soa1.template begin<Particle_T::AttributeNames::ownershipState>();
308 const auto *const __restrict ownedStatePtr2 = soa2.template begin<Particle_T::AttributeNames::ownershipState>();
309
310 auto *const __restrict fx1ptr = soa1.template begin<Particle_T::AttributeNames::forceX>();
311 auto *const __restrict fy1ptr = soa1.template begin<Particle_T::AttributeNames::forceY>();
312 auto *const __restrict fz1ptr = soa1.template begin<Particle_T::AttributeNames::forceZ>();
313 auto *const __restrict fx2ptr = soa2.template begin<Particle_T::AttributeNames::forceX>();
314 auto *const __restrict fy2ptr = soa2.template begin<Particle_T::AttributeNames::forceY>();
315 auto *const __restrict fz2ptr = soa2.template begin<Particle_T::AttributeNames::forceZ>();
316
317 const auto *const __restrict typeID1ptr = soa1.template begin<Particle_T::AttributeNames::typeId>();
318 const auto *const __restrict typeID2ptr = soa2.template begin<Particle_T::AttributeNames::typeId>();
319
320 svfloat64_t virialSumX = svdup_f64(0.0);
321 svfloat64_t virialSumY = svdup_f64(0.0);
322 svfloat64_t virialSumZ = svdup_f64(0.0);
323 svfloat64_t potentialEnergySum = svdup_f64(0.0);
324
325 const auto vecLength = (unsigned int)svlen_f64(potentialEnergySum);
326
327 for (unsigned int i = 0; i < soa1.size(); ++i) {
328 if (ownedStatePtr1[i] == autopas::OwnershipState::dummy) {
329 // If the i-th particle is a dummy, skip this loop iteration.
330 continue;
331 }
332
333 svfloat64_t fxacc = svdup_f64(0.0);
334 svfloat64_t fyacc = svdup_f64(0.0);
335 svfloat64_t fzacc = svdup_f64(0.0);
336
337 static_assert(std::is_same_v<std::underlying_type_t<autopas::OwnershipState>, int64_t>,
338 "OwnershipStates underlying type should be int64_t!");
339
340 const svfloat64_t x1 = svdup_f64(x1ptr[i]);
341 const svfloat64_t y1 = svdup_f64(y1ptr[i]);
342 const svfloat64_t z1 = svdup_f64(z1ptr[i]);
343
344 svbool_t pg_1, pg_2, pg_3, pg_4;
345 unsigned int j = 0;
346 for (; j < soa2.size(); j += vecLength * 4) {
347 const unsigned int j_2 = j + vecLength;
348 const unsigned int j_3 = j_2 + vecLength;
349 const unsigned int j_4 = j_3 + vecLength;
350 pg_1 = svwhilelt_b64(j, (unsigned int)soa2.size());
351 pg_2 = svwhilelt_b64(j_2, (unsigned int)soa2.size());
352 pg_3 = svwhilelt_b64(j_3, (unsigned int)soa2.size());
353 pg_4 = svwhilelt_b64(j_4, (unsigned int)soa2.size());
354
355 SoAKernel<newton3, false>(j, ownedStatePtr1[i] == autopas::OwnershipState::owned,
356 reinterpret_cast<const int64_t *>(ownedStatePtr2), x1, y1, z1, x2ptr, y2ptr, z2ptr,
357 fx2ptr, fy2ptr, fz2ptr, typeID1ptr, typeID2ptr, fxacc, fyacc, fzacc, virialSumX,
358 virialSumY, virialSumZ, potentialEnergySum, pg_1, svundef_u64(), pg_2, svundef_u64(),
359 pg_3, svundef_u64(), pg_4, svundef_u64());
360 }
361
362 fx1ptr[i] += svaddv_f64(svptrue_b64(), fxacc);
363 fy1ptr[i] += svaddv_f64(svptrue_b64(), fyacc);
364 fz1ptr[i] += svaddv_f64(svptrue_b64(), fzacc);
365 }
366
367 if constexpr (calculateGlobals) {
368 const int threadnum = autopas::autopas_get_thread_num();
369
370 _aosThreadData[threadnum].potentialEnergySum += svaddv_f64(svptrue_b64(), potentialEnergySum);
371 _aosThreadData[threadnum].virialSum[0] += svaddv_f64(svptrue_b64(), virialSumX);
372 _aosThreadData[threadnum].virialSum[1] += svaddv_f64(svptrue_b64(), virialSumY);
373 _aosThreadData[threadnum].virialSum[2] += svaddv_f64(svptrue_b64(), virialSumZ);
374 }
375#endif
376 }
377#ifdef __ARM_FEATURE_SVE
378 template <bool indexed>
379 inline svbool_t distCalc(const size_t j, const svuint64_t &index, const svfloat64_t &x1, const svfloat64_t &y1,
380 const svfloat64_t &z1, const svbool_t &pg, const int64_t *const __restrict ownedStatePtr2,
381 const double *const __restrict x2ptr, const double *const __restrict y2ptr,
382 const double *const __restrict z2ptr, svfloat64_t &drx, svfloat64_t &dry, svfloat64_t &drz,
383 svfloat64_t &dr2, svint64_t &ownedStateJ) {
384 const svfloat64_t x2 = (indexed) ? svld1_gather_index(pg, x2ptr, index) : svld1(pg, &x2ptr[j]);
385 const svfloat64_t y2 = (indexed) ? svld1_gather_index(pg, y2ptr, index) : svld1(pg, &y2ptr[j]);
386 const svfloat64_t z2 = (indexed) ? svld1_gather_index(pg, z2ptr, index) : svld1(pg, &z2ptr[j]);
387
388 // having these three as not _m worsens performance
389 drx = svsub_m(pg, x1, x2);
390 dry = svsub_m(pg, y1, y2);
391 drz = svsub_m(pg, z1, z2);
392
393 const svfloat64_t dr2_1 = svmul_x(pg, drx, drx);
394 const svfloat64_t dr2_2 = svmla_x(pg, dr2_1, dry, dry);
395 dr2 = svmla_x(pg, dr2_2, drz, drz);
396
397 const svbool_t cutoffMask = svcmple(pg, dr2, _cutoffSquared);
398
399 ownedStateJ = (indexed) ? svld1_gather_index(pg, ownedStatePtr2, index) : svld1(pg, &ownedStatePtr2[j]);
400 const svbool_t dummyMask = svcmpne(pg, ownedStateJ, (int64_t)autopas::OwnershipState::dummy);
401 return svand_z(pg, cutoffMask, dummyMask);
402 }
403
404 template <bool indexed>
405 inline void lennardJones(const svuint64_t &index, const size_t *const typeID1ptr, const size_t *const typeID2ptr,
406 const svbool_t &pgC, const svfloat64_t &dr2, svfloat64_t &epsilon24s, svfloat64_t &shift6s,
407 svfloat64_t &lj6, svfloat64_t &fac) {
408 const svuint64_t typeIds =
409 useMixing ? svmul_m(pgC, (indexed) ? svld1_gather_index(pgC, typeID2ptr, index) : svld1_u64(pgC, typeID2ptr), 3)
410 : svundef_u64();
411 const auto mixingDataPtr = useMixing ? _PPLibrary->getLJMixingDataPtr(*typeID1ptr, 0) : nullptr;
412
413 const svfloat64_t sigmaSquareds =
414 useMixing ? svld1_gather_index(pgC, mixingDataPtr + 1, typeIds) : svdup_f64(_sigmaSquared);
415 epsilon24s = useMixing ? svld1_gather_index(pgC, mixingDataPtr, typeIds) : svdup_f64(_epsilon24);
416 shift6s = (useMixing && applyShift) ? svld1_gather_index(pgC, mixingDataPtr + 2, typeIds) : svdup_f64(_shift6);
417
418 svfloat64_t invdr2 = svrecpe(dr2);
419 invdr2 = svmul_x(pgC, invdr2, svrecps(dr2, invdr2));
420 invdr2 = svmul_x(pgC, invdr2, svrecps(dr2, invdr2));
421 invdr2 = svmul_x(pgC, invdr2, svrecps(dr2, invdr2));
422 invdr2 = svmul_x(pgC, invdr2, svrecps(dr2, invdr2));
423 const svfloat64_t lj2 = svmul_x(pgC, sigmaSquareds, invdr2);
424
425 const svfloat64_t lj4 = svmul_x(pgC, lj2, lj2);
426 lj6 = svmul_x(pgC, lj2, lj4);
427 const svfloat64_t lj12 = svmul_x(pgC, lj6, lj6);
428
429 //(2*lj12 - lj6) * epsilon24 * invdr2 = -(lj6 - 2*lj12) * epsilon24 * invdr2
430
431 const svfloat64_t lj12m6alj12 = svnmls_x(pgC, lj6, lj12, 2);
432
433 const svfloat64_t lj12m6alj12e = svmul_x(pgC, lj12m6alj12, epsilon24s);
434 fac = svmul_x(pgC, lj12m6alj12e, invdr2);
435 }
436
437 template <bool newton3, bool indexed>
438 inline void applyForces(const size_t j, const svuint64_t &index, const bool ownedStateIisOwned,
439 double *const __restrict fx2ptr, double *const __restrict fy2ptr,
440 double *const __restrict fz2ptr, svfloat64_t &fxacc, svfloat64_t &fyacc, svfloat64_t &fzacc,
441 svfloat64_t &virialSumX, svfloat64_t &virialSumY, svfloat64_t &virialSumZ,
442 svfloat64_t &potentialEnergySum, const svfloat64_t &drx, const svfloat64_t &dry,
443 const svfloat64_t &drz, const double *const __restrict x2ptr,
444 const double *const __restrict y2ptr, const double *const __restrict z2ptr,
445 const svint64_t &ownedStateJ, const svbool_t &pgC, const svfloat64_t &epsilon24s,
446 const svfloat64_t &shift6s, const svfloat64_t &lj6, const svfloat64_t &fac) {
447 const svfloat64_t fx = svmul_x(pgC, drx, fac);
448 const svfloat64_t fy = svmul_x(pgC, dry, fac);
449 const svfloat64_t fz = svmul_x(pgC, drz, fac);
450
451 fxacc = svadd_f64_m(pgC, fxacc, fx);
452 fyacc = svadd_f64_m(pgC, fyacc, fy);
453 fzacc = svadd_f64_m(pgC, fzacc, fz);
454
455 if (newton3) {
456 const svfloat64_t fx2 = (indexed) ? svld1_gather_index(pgC, fx2ptr, index) : svld1_f64(pgC, &fx2ptr[j]);
457 const svfloat64_t fy2 = (indexed) ? svld1_gather_index(pgC, fy2ptr, index) : svld1_f64(pgC, &fy2ptr[j]);
458 const svfloat64_t fz2 = (indexed) ? svld1_gather_index(pgC, fz2ptr, index) : svld1_f64(pgC, &fz2ptr[j]);
459
460 const svfloat64_t fx2new = svsub_x(pgC, fx2, fx);
461 const svfloat64_t fy2new = svsub_x(pgC, fy2, fy);
462 const svfloat64_t fz2new = svsub_x(pgC, fz2, fz);
463
464 if constexpr (indexed) {
465 svst1_scatter_index(pgC, &fx2ptr[0], index, fx2new);
466 svst1_scatter_index(pgC, &fy2ptr[0], index, fy2new);
467 svst1_scatter_index(pgC, &fz2ptr[0], index, fz2new);
468 } else {
469 svst1(pgC, &fx2ptr[j], fx2new);
470 svst1(pgC, &fy2ptr[j], fy2new);
471 svst1(pgC, &fz2ptr[j], fz2new);
472 }
473 }
474
475 if (calculateGlobals) {
476 // Global Virial
477 const svfloat64_t virialX = svmul_x(pgC, fx, drx);
478 const svfloat64_t virialY = svmul_x(pgC, fy, dry);
479 const svfloat64_t virialZ = svmul_x(pgC, fz, drz);
480
481 // Global Potential
482 const svfloat64_t lj12m6 = svnmls_x(pgC, lj6, lj6, lj6);
483 const svfloat64_t potentialEnergy6 = svmad_x(pgC, epsilon24s, lj12m6, shift6s);
484 svfloat64_t energyFactor = svdup_f64(ownedStateIisOwned ? 1.0 : 0.0);
485
486 if constexpr (newton3) {
487 svbool_t ownedMaskJ = svcmpeq(pgC, ownedStateJ, (int64_t)autopas::OwnershipState::owned);
488 energyFactor = svadd_m(ownedMaskJ, energyFactor, 1.0);
489 }
490 potentialEnergySum = svmla_m(pgC, potentialEnergySum, energyFactor, potentialEnergy6);
491 virialSumX = svmla_m(pgC, virialSumX, energyFactor, virialX);
492 virialSumY = svmla_m(pgC, virialSumY, energyFactor, virialY);
493 virialSumZ = svmla_m(pgC, virialSumZ, energyFactor, virialZ);
494 }
495 }
496
497 template <bool newton3, bool indexed>
498 // FCC needs to be forced to inline this function. Otherwise a dramatic loss in performance can be observed.
499 __attribute__((always_inline)) inline void SoAKernel(
500 const size_t j, const bool ownedStateIisOwned, const int64_t *const __restrict ownedStatePtr2,
501 const svfloat64_t &x1, const svfloat64_t &y1, const svfloat64_t &z1, const double *const __restrict x2ptr,
502 const double *const __restrict y2ptr, const double *const __restrict z2ptr, double *const __restrict fx2ptr,
503 double *const __restrict fy2ptr, double *const __restrict fz2ptr, const size_t *const typeID1ptr,
504 const size_t *const typeID2ptr, svfloat64_t &fxacc, svfloat64_t &fyacc, svfloat64_t &fzacc,
505 svfloat64_t &virialSumX, svfloat64_t &virialSumY, svfloat64_t &virialSumZ, svfloat64_t &potentialEnergySum,
506
507 const svbool_t &pg_1, const svuint64_t &index_1, const svbool_t &pg_2, const svuint64_t &index_2,
508 const svbool_t &pg_3, const svuint64_t &index_3, const svbool_t &pg_4, const svuint64_t &index_4
509
510 ) {
511 svfloat64_t drx_1;
512 svfloat64_t dry_1;
513 svfloat64_t drz_1;
514 svfloat64_t dr2_1;
515 svint64_t ownedStateJ_1;
516 const svbool_t pgC_1 = distCalc<indexed>(j, index_1, x1, y1, z1, pg_1, ownedStatePtr2, x2ptr, y2ptr, z2ptr, drx_1,
517 dry_1, drz_1, dr2_1, ownedStateJ_1);
518
519 svfloat64_t drx_2;
520 svfloat64_t dry_2;
521 svfloat64_t drz_2;
522 svfloat64_t dr2_2;
523 svint64_t ownedStateJ_2;
524 const svbool_t pgC_2 = distCalc<indexed>(j + svlen(x1), index_2, x1, y1, z1, pg_2, ownedStatePtr2, x2ptr, y2ptr,
525 z2ptr, drx_2, dry_2, drz_2, dr2_2, ownedStateJ_2);
526
527 svfloat64_t drx_3;
528 svfloat64_t dry_3;
529 svfloat64_t drz_3;
530 svfloat64_t dr2_3;
531 svint64_t ownedStateJ_3;
532 const svbool_t pgC_3 = distCalc<indexed>(j + svlen(x1) * 2, index_3, x1, y1, z1, pg_3, ownedStatePtr2, x2ptr, y2ptr,
533 z2ptr, drx_3, dry_3, drz_3, dr2_3, ownedStateJ_3);
534
535 svfloat64_t drx_4;
536 svfloat64_t dry_4;
537 svfloat64_t drz_4;
538 svfloat64_t dr2_4;
539 svint64_t ownedStateJ_4;
540 const svbool_t pgC_4 = distCalc<indexed>(j + svlen(x1) * 3, index_4, x1, y1, z1, pg_4, ownedStatePtr2, x2ptr, y2ptr,
541 z2ptr, drx_4, dry_4, drz_4, dr2_4, ownedStateJ_4);
542
543 const bool continue_1 = svptest_any(svptrue_b64(), pgC_1);
544 const bool continue_2 = svptest_any(svptrue_b64(), pgC_2);
545 const bool continue_3 = svptest_any(svptrue_b64(), pgC_3);
546 const bool continue_4 = svptest_any(svptrue_b64(), pgC_4);
547
548 svfloat64_t epsilon24s_1;
549 svfloat64_t shift6s_1;
550 svfloat64_t lj6_1;
551 svfloat64_t fac_1;
552 if (continue_1)
553 lennardJones<indexed>(index_1, typeID1ptr, typeID2ptr, pgC_1, dr2_1, epsilon24s_1, shift6s_1, lj6_1, fac_1);
554
555 svfloat64_t epsilon24s_2;
556 svfloat64_t shift6s_2;
557 svfloat64_t lj6_2;
558 svfloat64_t fac_2;
559 if (continue_2)
560 lennardJones<indexed>(index_2, typeID1ptr, typeID2ptr, pgC_2, dr2_2, epsilon24s_2, shift6s_2, lj6_2, fac_2);
561
562 svfloat64_t epsilon24s_3;
563 svfloat64_t shift6s_3;
564 svfloat64_t lj6_3;
565 svfloat64_t fac_3;
566 if (continue_3)
567 lennardJones<indexed>(index_3, typeID1ptr, typeID2ptr, pgC_3, dr2_3, epsilon24s_3, shift6s_3, lj6_3, fac_3);
568
569 svfloat64_t epsilon24s_4;
570 svfloat64_t shift6s_4;
571 svfloat64_t lj6_4;
572 svfloat64_t fac_4;
573 if (continue_4)
574 lennardJones<indexed>(index_4, typeID1ptr, typeID2ptr, pgC_4, dr2_4, epsilon24s_4, shift6s_4, lj6_4, fac_4);
575
576 if (continue_1)
577 applyForces<newton3, indexed>(j, index_1, ownedStateIisOwned, fx2ptr, fy2ptr, fz2ptr, fxacc, fyacc, fzacc,
578 virialSumX, virialSumY, virialSumZ, potentialEnergySum, drx_1, dry_1, drz_1, x2ptr,
579 y2ptr, z2ptr, ownedStateJ_1, pgC_1, epsilon24s_1, shift6s_1, lj6_1, fac_1);
580 if (continue_2)
581 applyForces<newton3, indexed>(j + svlen(x1), index_2, ownedStateIisOwned, fx2ptr, fy2ptr, fz2ptr, fxacc, fyacc,
582 fzacc, virialSumX, virialSumY, virialSumZ, potentialEnergySum, drx_2, dry_2, drz_2,
583 x2ptr, y2ptr, z2ptr, ownedStateJ_2, pgC_2, epsilon24s_2, shift6s_2, lj6_2, fac_2);
584
585 if (continue_3)
586 applyForces<newton3, indexed>(j + svlen(x1) * 2, index_3, ownedStateIisOwned, fx2ptr, fy2ptr, fz2ptr, fxacc,
587 fyacc, fzacc, virialSumX, virialSumY, virialSumZ, potentialEnergySum, drx_3, dry_3,
588 drz_3, x2ptr, y2ptr, z2ptr, ownedStateJ_3, pgC_3, epsilon24s_3, shift6s_3, lj6_3,
589 fac_3);
590
591 if (continue_4)
592 applyForces<newton3, indexed>(j + svlen(x1) * 3, index_4, ownedStateIisOwned, fx2ptr, fy2ptr, fz2ptr, fxacc,
593 fyacc, fzacc, virialSumX, virialSumY, virialSumZ, potentialEnergySum, drx_4, dry_4,
594 drz_4, x2ptr, y2ptr, z2ptr, ownedStateJ_4, pgC_4, epsilon24s_4, shift6s_4, lj6_4,
595 fac_4);
596 }
597#endif
598
599 public:
600 // clang-format off
606 // clang-format on
607 void SoAFunctorVerlet(autopas::SoAView<SoAArraysType> soa, const size_t indexFirst,
608 const std::vector<size_t, autopas::AlignedAllocator<size_t>> &neighborList,
609 bool newton3) final {
610 if (soa.size() == 0 or neighborList.empty()) return;
611 if (newton3) {
612 SoAFunctorVerletImpl<true>(soa, indexFirst, neighborList);
613 } else {
614 SoAFunctorVerletImpl<false>(soa, indexFirst, neighborList);
615 }
616 }
617
618 private:
619 template <bool newton3>
620 void SoAFunctorVerletImpl(autopas::SoAView<SoAArraysType> soa, const size_t indexFirst,
621 const std::vector<size_t, autopas::AlignedAllocator<size_t>> &neighborList) {
622#ifdef __ARM_FEATURE_SVE
623 const auto *const __restrict ownedStatePtr = soa.template begin<Particle_T::AttributeNames::ownershipState>();
624 if (ownedStatePtr[indexFirst] == autopas::OwnershipState::dummy) {
625 return;
626 }
627 const auto *const __restrict xptr = soa.template begin<Particle_T::AttributeNames::posX>();
628 const auto *const __restrict yptr = soa.template begin<Particle_T::AttributeNames::posY>();
629 const auto *const __restrict zptr = soa.template begin<Particle_T::AttributeNames::posZ>();
630
631 auto *const __restrict fxptr = soa.template begin<Particle_T::AttributeNames::forceX>();
632 auto *const __restrict fyptr = soa.template begin<Particle_T::AttributeNames::forceY>();
633 auto *const __restrict fzptr = soa.template begin<Particle_T::AttributeNames::forceZ>();
634
635 const auto *const __restrict typeIDptr = soa.template begin<Particle_T::AttributeNames::typeId>();
636
637 // accumulators
638 svfloat64_t virialSumX = svdup_f64(0.0);
639 svfloat64_t virialSumY = svdup_f64(0.0);
640 svfloat64_t virialSumZ = svdup_f64(0.0);
641 svfloat64_t potentialEnergySum = svdup_f64(0.0);
642
643 svfloat64_t fxacc = svdup_f64(0.0);
644 svfloat64_t fyacc = svdup_f64(0.0);
645 svfloat64_t fzacc = svdup_f64(0.0);
646
647 // broadcast particle 1
648 const auto x1 = svdup_f64(xptr[indexFirst]);
649 const auto y1 = svdup_f64(yptr[indexFirst]);
650 const auto z1 = svdup_f64(zptr[indexFirst]);
651
652 svbool_t pg_1;
653 const auto *const ownedStatePtr2 = reinterpret_cast<const int64_t *>(ownedStatePtr);
654 size_t j = 0;
655 for (; j < neighborList.size(); j += svlen(x1)) {
656 pg_1 = svwhilelt_b64(j, neighborList.size());
657 const svuint64_t index_1 = svld1(pg_1, &neighborList[j]);
658
659 svfloat64_t drx_1;
660 svfloat64_t dry_1;
661 svfloat64_t drz_1;
662 svfloat64_t dr2_1;
663 svint64_t ownedStateJ_1;
664 const svbool_t pgC_1 = distCalc<true>(0, index_1, x1, y1, z1, pg_1, ownedStatePtr2, xptr, yptr, zptr, drx_1,
665 dry_1, drz_1, dr2_1, ownedStateJ_1);
666
667 const bool continue_1 = svptest_any(svptrue_b64(), pgC_1);
668 svfloat64_t epsilon24s_1;
669 svfloat64_t shift6s_1;
670 svfloat64_t lj6_1;
671 svfloat64_t fac_1;
672 if (continue_1)
673 lennardJones<true>(index_1, typeIDptr, typeIDptr, pgC_1, dr2_1, epsilon24s_1, shift6s_1, lj6_1, fac_1);
674
675 if (continue_1)
676 applyForces<newton3, true>(0, index_1, ownedStatePtr[indexFirst] == autopas::OwnershipState::owned, fxptr,
677 fyptr, fzptr, fxacc, fyacc, fzacc, virialSumX, virialSumY, virialSumZ,
678 potentialEnergySum, drx_1, dry_1, drz_1, xptr, yptr, zptr, ownedStateJ_1, pgC_1,
679 epsilon24s_1, shift6s_1, lj6_1, fac_1);
680 }
681
682 fxptr[indexFirst] += svaddv_f64(svptrue_b64(), fxacc);
683 fyptr[indexFirst] += svaddv_f64(svptrue_b64(), fyacc);
684 fzptr[indexFirst] += svaddv_f64(svptrue_b64(), fzacc);
685
686 if constexpr (calculateGlobals) {
687 const int threadnum = autopas::autopas_get_thread_num();
688
689 _aosThreadData[threadnum].potentialEnergySum += svaddv_f64(svptrue_b64(), potentialEnergySum);
690 _aosThreadData[threadnum].virialSum[0] += svaddv_f64(svptrue_b64(), virialSumX);
691 _aosThreadData[threadnum].virialSum[1] += svaddv_f64(svptrue_b64(), virialSumY);
692 _aosThreadData[threadnum].virialSum[2] += svaddv_f64(svptrue_b64(), virialSumZ);
693 }
694#endif
695 }
696
697 public:
701 constexpr static auto getNeededAttr() {
702 return std::array<typename Particle_T::AttributeNames, 9>{Particle_T::AttributeNames::id,
703 Particle_T::AttributeNames::posX,
704 Particle_T::AttributeNames::posY,
705 Particle_T::AttributeNames::posZ,
706 Particle_T::AttributeNames::forceX,
707 Particle_T::AttributeNames::forceY,
708 Particle_T::AttributeNames::forceZ,
709 Particle_T::AttributeNames::typeId,
710 Particle_T::AttributeNames::ownershipState};
711 }
712
716 constexpr static auto getNeededAttr(std::false_type) {
717 return std::array<typename Particle_T::AttributeNames, 6>{
718 Particle_T::AttributeNames::id, Particle_T::AttributeNames::posX,
719 Particle_T::AttributeNames::posY, Particle_T::AttributeNames::posZ,
720 Particle_T::AttributeNames::typeId, Particle_T::AttributeNames::ownershipState};
721 }
722
726 constexpr static auto getComputedAttr() {
727 return std::array<typename Particle_T::AttributeNames, 3>{
728 Particle_T::AttributeNames::forceX, Particle_T::AttributeNames::forceY, Particle_T::AttributeNames::forceZ};
729 }
730
735 constexpr static bool getMixing() { return useMixing; }
736
748 static unsigned long getNumFlopsPerKernelCall(size_t molAType, size_t molBType, bool newton3) {
749 // Kernel: 12 = 1 (inverse R squared) + 8 (compute scale) + 3 (apply scale) sum
750 // Adding to particle forces: 6 or 3 depending newton3
751 // Total = 12 + (6 or 3) = 18 or 15
752 return newton3 ? 18ul : 15ul;
753 }
754
759 void initTraversal() final {
760 _potentialEnergySum = 0.;
761 _virialSum = {0., 0., 0.};
762 _postProcessed = false;
763 for (size_t i = 0; i < _aosThreadData.size(); ++i) {
764 _aosThreadData[i].setZero();
765 }
766 }
767
772 void endTraversal(bool newton3) final {
773 using namespace autopas::utils::ArrayMath::literals;
774
775 if (_postProcessed) {
777 "Already postprocessed, endTraversal(bool newton3) was called twice without calling initTraversal().");
778 }
779 if (calculateGlobals) {
780 for (size_t i = 0; i < _aosThreadData.size(); ++i) {
781 _potentialEnergySum += _aosThreadData[i].potentialEnergySum;
782 _virialSum += _aosThreadData[i].virialSum;
783 }
784 // For each interaction, we added the full contribution for both particles. Divide by 2 here, so that each
785 // contribution is only counted once per pair.
786 _potentialEnergySum *= 0.5;
787 _virialSum *= 0.5;
788
789 // We have always calculated 6*potentialEnergy, so we divide by 6 here!
790 _potentialEnergySum /= 6.;
791 _postProcessed = true;
792
793 AutoPasLog(DEBUG, "Final potential energy {}", _potentialEnergySum);
794 AutoPasLog(DEBUG, "Final virial {}", _virialSum[0] + _virialSum[1] + _virialSum[2]);
795 }
796 }
797
803 if (not calculateGlobals) {
805 "Trying to get potential energy even though calculateGlobals is false. If you want this functor to calculate "
806 "global "
807 "values, please specify calculateGlobals to be true.");
808 }
809 if (not _postProcessed) {
811 "Cannot get potential energy, because endTraversal was not called.");
812 }
813 return _potentialEnergySum;
814 }
815
820 double getVirial() {
821 if (not calculateGlobals) {
823 "Trying to get virial even though calculateGlobals is false. If you want this functor to calculate global "
824 "values, please specify calculateGlobals to be true.");
825 }
826 if (not _postProcessed) {
828 "Cannot get virial, because endTraversal was not called.");
829 }
830 return _virialSum[0] + _virialSum[1] + _virialSum[2];
831 }
832
841 void setParticleProperties(double epsilon24, double sigmaSquared) {
842#ifdef __ARM_FEATURE_SVE
843 _epsilon24 = epsilon24;
844 _sigmaSquared = sigmaSquared;
845 if constexpr (applyShift) {
846 _shift6 = ParticlePropertiesLibrary<double, size_t>::calcShift6(epsilon24, sigmaSquared, _cutoffSquared);
847 } else {
848 _shift6 = 0.0;
849 }
850#endif
851
852 _epsilon24AoS = epsilon24;
853 _sigmaSquaredAoS = sigmaSquared;
854 if constexpr (applyShift) {
855 _shift6AoS = ParticlePropertiesLibrary<double, size_t>::calcShift6(epsilon24, sigmaSquared, _cutoffSquaredAoS);
856 } else {
857 _shift6AoS = 0.;
858 }
859 }
860
861 private:
865 class AoSThreadData {
866 public:
867 AoSThreadData() : virialSum{0., 0., 0.}, potentialEnergySum{0.}, __remainingTo64{} {}
868 void setZero() {
869 virialSum = {0., 0., 0.};
870 potentialEnergySum = 0.;
871 }
872
873 // variables
874 std::array<double, 3> virialSum;
875 double potentialEnergySum;
876
877 private:
878 // dummy parameter to get the right size (64 bytes)
879 double __remainingTo64[(64 - 4 * sizeof(double)) / sizeof(double)];
880 };
881 // make sure of the size of AoSThreadData
882 static_assert(sizeof(AoSThreadData) % 64 == 0, "AoSThreadData has wrong size");
883
884#ifdef __ARM_FEATURE_SVE
885 const double _cutoffSquared{};
886 double _shift6{0.};
887 double _epsilon24{0.};
888 double _sigmaSquared{0.};
889#endif
890
891 const double _cutoffSquaredAoS;
892 double _epsilon24AoS{0.}, _sigmaSquaredAoS{0.}, _shift6AoS{0.};
893
894 ParticlePropertiesLibrary<double, size_t> *_PPLibrary = nullptr;
895
896 // sum of the potential energy, only calculated if calculateGlobals is true
897 double _potentialEnergySum{0.};
898
899 // sum of the virial, only calculated if calculateGlobals is true
900 std::array<double, 3> _virialSum{0., 0., 0.};
901
902 // thread buffer for aos
903 std::vector<AoSThreadData> _aosThreadData{};
904
905 // defines whether or whether not the global values are already preprocessed
906 bool _postProcessed{false};
907};
908} // namespace mdLib
#define AutoPasLog(lvl, fmt,...)
Macro for logging providing common meta information without filename.
Definition: Logger.h:24
This class stores the (physical) properties of molecule types, and, in the case of multi-site molecul...
Definition: ParticlePropertiesLibrary.h:28
floatType getMixingShift6(intType i, intType j) const
Returns precomputed mixed shift * 6 for one pair of site types.
Definition: ParticlePropertiesLibrary.h:262
floatType getMixingSigmaSquared(intType i, intType j) const
Returns precomputed mixed squared sigma for one pair of site types.
Definition: ParticlePropertiesLibrary.h:252
const double * getLJMixingDataPtr(intType i, intType j)
Get a pointer to Mixing Data for one pair of LJ site types.
Definition: ParticlePropertiesLibrary.h:242
floatType getMixing24Epsilon(intType i, intType j) const
Returns the precomputed mixed epsilon * 24.
Definition: ParticlePropertiesLibrary.h:224
static double calcShift6(double epsilon24, double sigmaSquared, double cutoffSquared)
Calculate the shift multiplied 6 of the lennard jones potential from given cutoff,...
Definition: ParticlePropertiesLibrary.h:576
AlignedAllocator class.
Definition: AlignedAllocator.h:29
PairwiseFunctor class.
Definition: PairwiseFunctor.h:31
View on a fixed part of a SoA between a start index and an end index.
Definition: SoAView.h:23
size_t size() const
Returns the number of particles in the view.
Definition: SoAView.h:83
Default exception class for autopas exceptions.
Definition: ExceptionHandler.h:116
static void exception(const Exception e)
Handle an exception derived by std::exception.
Definition: ExceptionHandler.h:64
A functor to handle lennard-jones interactions between two particles (molecules).
Definition: LJFunctorSVE.h:51
void initTraversal() final
Reset the global values.
Definition: LJFunctorSVE.h:759
double getVirial()
Get the virial.
Definition: LJFunctorSVE.h:820
bool isRelevantForTuning() final
Specifies whether the functor should be considered for the auto-tuning process.
Definition: LJFunctorSVE.h:120
void SoAFunctorPair(autopas::SoAView< SoAArraysType > soa1, autopas::SoAView< SoAArraysType > soa2, const bool newton3) final
PairwiseFunctor for structure of arrays (SoA)
Definition: LJFunctorSVE.h:203
static constexpr bool getMixing()
Definition: LJFunctorSVE.h:735
LJFunctorSVE()=delete
Deleted default constructor.
static unsigned long getNumFlopsPerKernelCall(size_t molAType, size_t molBType, bool newton3)
Get the number of flops used per kernel call for a given particle pair.
Definition: LJFunctorSVE.h:748
LJFunctorSVE(double cutoff)
Constructor for Functor with mixing disabled.
Definition: LJFunctorSVE.h:98
static constexpr auto getNeededAttr()
Get attributes needed for computation.
Definition: LJFunctorSVE.h:701
void endTraversal(bool newton3) final
Accumulates global values, e.g.
Definition: LJFunctorSVE.h:772
static constexpr auto getNeededAttr(std::false_type)
Get attributes needed for computation without N3 optimization.
Definition: LJFunctorSVE.h:716
void setParticleProperties(double epsilon24, double sigmaSquared)
Sets the particle properties constants for this functor.
Definition: LJFunctorSVE.h:841
std::string getName() final
Returns name of functor.
Definition: LJFunctorSVE.h:118
double getPotentialEnergy()
Get the potential Energy.
Definition: LJFunctorSVE.h:802
LJFunctorSVE(double cutoff, ParticlePropertiesLibrary< double, size_t > &particlePropertiesLibrary)
Constructor for Functor with mixing active.
Definition: LJFunctorSVE.h:110
bool allowsNonNewton3() final
Specifies whether the functor is capable of non-Newton3-like functors.
Definition: LJFunctorSVE.h:126
void SoAFunctorSingle(autopas::SoAView< SoAArraysType > soa, bool newton3) final
PairwiseFunctor for structure of arrays (SoA)
Definition: LJFunctorSVE.h:190
bool allowsNewton3() final
Specifies whether the functor is capable of Newton3-like functors.
Definition: LJFunctorSVE.h:122
static constexpr auto getComputedAttr()
Get attributes computed by this functor.
Definition: LJFunctorSVE.h:726
void AoSFunctor(Particle_T &i, Particle_T &j, bool newton3) final
PairwiseFunctor for arrays of structures (AoS).
Definition: LJFunctorSVE.h:130
void SoAFunctorVerlet(autopas::SoAView< SoAArraysType > soa, const size_t indexFirst, const std::vector< size_t, autopas::AlignedAllocator< size_t > > &neighborList, bool newton3) final
PairwiseFunctor for structure of arrays (SoA) for neighbor lists.
Definition: LJFunctorSVE.h:607
constexpr T dot(const std::array< T, SIZE > &a, const std::array< T, SIZE > &b)
Generates the dot product of two arrays.
Definition: ArrayMath.h:233
int autopas_get_max_threads()
Dummy for omp_get_max_threads() when no OpenMP is available.
Definition: WrapOpenMP.h:144
@ dummy
Dummy or deleted state, a particle with this state is not an actual particle!
@ owned
Owned state, a particle with this state is an actual particle and owned by the current AutoPas object...
FunctorN3Modes
Newton 3 modes for the Functor.
Definition: Functor.h:23
int autopas_get_thread_num()
Dummy for omp_set_lock() when no OpenMP is available.
Definition: WrapOpenMP.h:132