AutoPas  3.0.0
Loading...
Searching...
No Matches
LJFunctorHWY.h
Go to the documentation of this file.
1
8#pragma once
9
10#include <hwy/highway.h>
11
12#include <optional>
13
21
22namespace mdLib {
23
24namespace highway = hwy::HWY_NAMESPACE;
26constexpr highway::ScalableTag<double> tag_double;
28constexpr highway::ScalableTag<int64_t> tag_long;
32HWY_LANES_CONSTEXPR inline size_t _vecLengthDouble{highway::Lanes(tag_double)};
36constexpr size_t _maxVecLengthDouble{highway::MaxLanes(tag_double)};
38using VectorDouble = decltype(highway::Zero(tag_double));
40using VectorLong = decltype(highway::Zero(tag_long));
42constexpr highway::Half<highway::DFromV<VectorDouble>> tag_double_half;
44using MaskDouble = decltype(highway::FirstN(tag_double, 1));
46using MaskLong = decltype(highway::FirstN(tag_long, 2));
48using VectorizationPattern = autopas::VectorizationPatternOption::Value;
49
61template <class Particle_T, bool applyShift = false, bool useMixing = false,
62 autopas::FunctorN3Modes useNewton3 = autopas::FunctorN3Modes::Both, bool calculateGlobals = false,
63 bool countFLOPs = false, bool relevantForTuning = true>
64
66 : public autopas::PairwiseFunctor<Particle_T, LJFunctorHWY<Particle_T, applyShift, useMixing, useNewton3,
67 calculateGlobals, countFLOPs, relevantForTuning>> {
68 using SoAArraysType = Particle_T::SoAArraysType;
69
70 public:
74 LJFunctorHWY() = delete;
75
82 explicit LJFunctorHWY(double cutoff, std::optional<std::reference_wrapper<ParticlePropertiesLibrary<double, size_t>>>
83 particlePropertiesLibrary = std::nullopt)
84 : autopas::PairwiseFunctor<Particle_T, LJFunctorHWY>(cutoff),
85 _cutoffSquareAoS{cutoff * cutoff},
86 _PPLibrary{particlePropertiesLibrary} {
87 if (calculateGlobals) {
88 _aosThreadData.resize(autopas::autopas_get_max_threads());
89 }
90
91 if constexpr (countFLOPs) {
92 AutoPasLog(DEBUG, "Using LJFunctorHWY with countFLOPs but FLOP counting is not implemented.");
93 }
94
95 if constexpr (useMixing) {
96 if (not _PPLibrary.has_value()) {
97 throw std::runtime_error("Mixing is enabled but no ParticlePropertiesLibrary was provided!");
98 }
99 } else {
100 if (_PPLibrary.has_value()) {
101 throw std::runtime_error("Mixing is disabled but a ParticlePropertiesLibrary was provided!");
102 }
103 }
104 }
105
106 std::string getName() final { return "LJFunctorHWY"; }
107
108 bool isRelevantForTuning() final { return relevantForTuning; }
109
110 bool allowsNewton3() final {
111 return useNewton3 == autopas::FunctorN3Modes::Newton3Only or useNewton3 == autopas::FunctorN3Modes::Both;
112 }
113
114 bool allowsNonNewton3() final {
115 return useNewton3 == autopas::FunctorN3Modes::Newton3Off or useNewton3 == autopas::FunctorN3Modes::Both;
116 }
117
125 bool isVecPatternAllowed(const VectorizationPattern vecPattern) final {
126 return std::ranges::find(_vecPatternsAllowed, vecPattern) != _vecPatternsAllowed.end();
127 }
128
132 inline void AoSFunctor(Particle_T &i, Particle_T &j, bool newton3) final {
133 using namespace autopas::utils::ArrayMath::literals;
134 if (i.isDummy() or j.isDummy()) {
135 return;
136 }
137 auto sigmaSquare = _sigmaSquareAoS;
138 auto epsilon24 = _epsilon24AoS;
139 auto shift6 = _shift6AoS;
140 if constexpr (useMixing) {
141 sigmaSquare = _PPLibrary->get().getMixingSigmaSquared(i.getTypeId(), j.getTypeId());
142 epsilon24 = _PPLibrary->get().getMixing24Epsilon(i.getTypeId(), j.getTypeId());
143 if constexpr (applyShift) {
144 shift6 = _PPLibrary->get().getMixingShift6(i.getTypeId(), j.getTypeId());
145 }
146 }
147 const auto dr = i.getR() - j.getR();
148 const double dr2 = autopas::utils::ArrayMath::dot(dr, dr);
149
150 if (dr2 > _cutoffSquareAoS) {
151 return;
152 }
153
154 const double invdr2 = 1. / dr2;
155 double lj6 = sigmaSquare * invdr2;
156 lj6 = lj6 * lj6 * lj6;
157 const double lj12 = lj6 * lj6;
158 const double lj12m6 = lj12 - lj6;
159 const double fac = epsilon24 * (lj12 + lj12m6) * invdr2;
160 const auto f = dr * fac;
161 i.addF(f);
162 if (newton3) {
163 // only if we use newton 3 here, we want to
164 j.subF(f);
165 }
166 if (calculateGlobals) {
167 const auto virial = dr * f;
168 const double potentialEnergy6 = epsilon24 * lj12m6 + shift6;
169
170 const int threadnum = autopas::autopas_get_thread_num();
171
172 if (i.isOwned()) {
173 _aosThreadData[threadnum].potentialEnergySum += potentialEnergy6;
174 _aosThreadData[threadnum].virialSum += virial;
175 }
176 // for non-newton3 the second particle will be considered in a separate calculation
177 if (newton3 and j.isOwned()) {
178 // for non-newton3 the division is in the post-processing step.
179 _aosThreadData[threadnum].potentialEnergySum += potentialEnergy6;
180 _aosThreadData[threadnum].virialSum += virial;
181 }
182 }
183 }
184
189 inline void SoAFunctorSingle(autopas::SoAView<SoAArraysType> soa, const bool newton3) final {
190 if (soa.size() == 0) return;
191
192 // obtain iterators for the various values
193 const auto *const __restrict xPtr = soa.template begin<Particle_T::AttributeNames::posX>();
194 const auto *const __restrict yPtr = soa.template begin<Particle_T::AttributeNames::posY>();
195 const auto *const __restrict zPtr = soa.template begin<Particle_T::AttributeNames::posZ>();
196
197 const auto *const __restrict ownedStatePtr = soa.template begin<Particle_T::AttributeNames::ownershipState>();
198
199 auto *const __restrict fxPtr = soa.template begin<Particle_T::AttributeNames::forceX>();
200 auto *const __restrict fyPtr = soa.template begin<Particle_T::AttributeNames::forceY>();
201 auto *const __restrict fzPtr = soa.template begin<Particle_T::AttributeNames::forceZ>();
202
203 const auto *const __restrict typeIDptr = soa.template begin<Particle_T::AttributeNames::typeId>();
204
205 // initialize and declare vector variables
206 auto virialSumX = highway::Zero(tag_double);
207 auto virialSumY = highway::Zero(tag_double);
208 auto virialSumZ = highway::Zero(tag_double);
209 auto uPotSum = highway::Zero(tag_double);
210
211 for (std::ptrdiff_t i = static_cast<std::ptrdiff_t>(soa.size()) - 1;
212 checkFirstLoopCondition<true, VectorizationPattern::p1xVec>(i, 0);
213 decrementFirstLoop<VectorizationPattern::p1xVec>(i)) {
214 static_assert(std::is_same_v<std::underlying_type_t<autopas::OwnershipState>, int64_t>,
215 "OwnershipStates underlying type should be int64_t!");
216
217 handleILoopBody<true, true, false, VectorizationPattern::p1xVec>(
218 i, xPtr, yPtr, zPtr, ownedStatePtr, xPtr, yPtr, zPtr, ownedStatePtr, fxPtr, fyPtr, fzPtr, fxPtr, fyPtr, fzPtr,
219 typeIDptr, typeIDptr, virialSumX, virialSumY, virialSumZ, uPotSum, 0, i);
220 }
221
222 if constexpr (calculateGlobals) {
223 computeGlobals(virialSumX, virialSumY, virialSumZ, uPotSum);
224 }
225 }
226
227 // clang-format off
231 // clang-format on
233 bool newton3) final {
234 switch (_vecPattern) {
235 case VectorizationPattern::p1xVec: {
236 if (newton3) {
237 SoAFunctorPairImpl<true, VectorizationPattern::p1xVec>(soa1, soa2);
238 } else {
239 SoAFunctorPairImpl<false, VectorizationPattern::p1xVec>(soa1, soa2);
240 }
241 break;
242 }
243 case VectorizationPattern::p2xVecDiv2: {
244 if (newton3) {
245 SoAFunctorPairImpl<true, VectorizationPattern::p2xVecDiv2>(soa1, soa2);
246 } else {
247 SoAFunctorPairImpl<false, VectorizationPattern::p2xVecDiv2>(soa1, soa2);
248 }
249 break;
250 }
251 case VectorizationPattern::pVecDiv2x2: {
252 if (newton3) {
253 SoAFunctorPairImpl<true, VectorizationPattern::pVecDiv2x2>(soa1, soa2);
254 } else {
255 SoAFunctorPairImpl<false, VectorizationPattern::pVecDiv2x2>(soa1, soa2);
256 }
257 break;
258 }
259 case VectorizationPattern::pVecx1: {
260 if (newton3) {
261 SoAFunctorPairImpl<true, VectorizationPattern::pVecx1>(soa1, soa2);
262 } else {
263 SoAFunctorPairImpl<false, VectorizationPattern::pVecx1>(soa1, soa2);
264 }
265 break;
266 }
267 default:
268 autopas::utils::ExceptionHandler::exception("Unknown VectorizationPattern!");
269 }
270 }
271
272 private:
282 template <bool reversed, VectorizationPattern vecPattern>
283 static constexpr bool checkFirstLoopCondition(const std::ptrdiff_t i, const long vecEnd) {
284 if constexpr (reversed) {
285 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
286 return i >= 0;
287 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
288 return i >= 1;
289 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
290 return i >= _vecLengthDouble / 2;
291 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
292 return i >= _vecLengthDouble;
293 } else {
294 return false;
295 }
296 } else {
297 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
298 return i < vecEnd;
299 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
300 return i < (vecEnd - 1);
301 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
302 // Switching to signed long to avoid integer underflows.
303 // Note: The narrowing conversion of _vecLengthDouble shouldn't cause issues as it is not expected to be so
304 // large.
305 const long criterion = vecEnd - _vecLengthDouble / 2 + 1;
306 return i < criterion;
307 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
308 // Switching to signed long to avoid integer underflows.
309 // Note: The narrowing conversion of _vecLengthDouble shouldn't cause issues as it is not expected to be so
310 // large.
311 const long criterion = vecEnd - _vecLengthDouble + 1;
312 return i < criterion;
313 } else {
314 return false;
315 }
316 }
317 }
318
326 template <VectorizationPattern vecPattern>
327 static constexpr void decrementFirstLoop(std::ptrdiff_t &i) {
328 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
329 --i;
330 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
331 i -= 2;
332 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
333 i -= _vecLengthDouble / 2;
334 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
335 i -= _vecLengthDouble;
336 }
337 }
338
346 template <VectorizationPattern vecPattern>
347 static constexpr void incrementFirstLoop(std::ptrdiff_t &i) {
348 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
349 ++i;
350 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
351 i += 2;
352 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
353 i += _vecLengthDouble / 2;
354 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
355 i += _vecLengthDouble;
356 }
357 }
358
368 template <bool reversed>
369 static constexpr int obtainILoopRemainderLength(std::ptrdiff_t i, const int vecEnd) {
370 return reversed ? (i < 0 ? 0 : i + 1) : vecEnd - i;
371 }
372
381 template <VectorizationPattern vecPattern>
382 static constexpr bool checkSecondLoopCondition(std::ptrdiff_t i, size_t j) {
383 std::ptrdiff_t limit = 0;
384
385 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
386 // Round i down to the next multiple of _vecLengthDouble
387 limit = i - (i % _vecLengthDouble);
388 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
389 // Round i down to the next multiple of _vecLengthDouble / 2
390 const std::ptrdiff_t block = _vecLengthDouble / 2;
391 limit = i - (i % block);
392 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
393 // Round i down to the next multiple of 2
394 limit = i - (i % 2);
395 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
396 // Rounding to a multiple of 1 is a no-op
397 limit = i;
398 } else {
399 // Unknown vectorization pattern
400 return false;
401 }
402
403 return j < static_cast<size_t>(limit);
404 }
405
413 template <VectorizationPattern vecPattern>
414 static constexpr void incrementSecondLoop(std::ptrdiff_t &j) {
415 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
416 j += _vecLengthDouble;
417 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
418 j += _vecLengthDouble / 2;
419 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
420 j += 2;
421 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
422 ++j;
423 }
424 }
425
434 template <VectorizationPattern vecPattern>
435 static constexpr int obtainJLoopRemainderLength(const std::ptrdiff_t j) {
436 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
437 return static_cast<int>(j & (_vecLengthDouble - 1));
438 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
439 return static_cast<int>(j & (_vecLengthDouble / 2 - 1));
440 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
441 return static_cast<int>(j & (1));
442 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
443 return 0;
444 } else {
445 return -1;
446 }
447 }
448
467 template <bool remainder, bool reversed, VectorizationPattern vecPattern>
468 static void fillIRegisters(const size_t i, const double *const __restrict xPtr, const double *const __restrict yPtr,
469 const double *const __restrict zPtr,
470 const autopas::OwnershipState *const __restrict ownedStatePtr, VectorDouble &x1,
471 VectorDouble &y1, VectorDouble &z1, MaskDouble &ownedMaskI, const size_t restI) {
472 VectorLong ownedStateILong = highway::Zero(tag_long);
473
474 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
475 const auto owned = static_cast<int64_t>(ownedStatePtr[i]);
476 ownedStateILong = highway::Set(tag_long, owned);
477
478 x1 = highway::Set(tag_double, xPtr[i]);
479 y1 = highway::Set(tag_double, yPtr[i]);
480 z1 = highway::Set(tag_double, zPtr[i]);
481 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
482 const auto ownedFirst = static_cast<int64_t>(ownedStatePtr[i]);
483 ownedStateILong = highway::Set(tag_long, ownedFirst);
484
485 x1 = highway::Set(tag_double, xPtr[i]);
486 y1 = highway::Set(tag_double, yPtr[i]);
487 z1 = highway::Set(tag_double, zPtr[i]);
488
489 VectorLong tmpOwnedI = highway::Zero(tag_long);
490 VectorDouble tmpX1 = highway::Zero(tag_double);
491 VectorDouble tmpY1 = highway::Zero(tag_double);
492 VectorDouble tmpZ1 = highway::Zero(tag_double);
493
494 if constexpr (not remainder) {
495 const auto index = reversed ? i - 1 : i + 1;
496 const auto ownedSecond = static_cast<int64_t>(ownedStatePtr[index]);
497 tmpOwnedI = highway::Set(tag_long, ownedSecond);
498 tmpX1 = highway::Set(tag_double, xPtr[index]);
499 tmpY1 = highway::Set(tag_double, yPtr[index]);
500 tmpZ1 = highway::Set(tag_double, zPtr[index]);
501 }
502
503 ownedStateILong = highway::ConcatLowerLower(tag_long, tmpOwnedI, ownedStateILong);
504 x1 = highway::ConcatLowerLower(tag_double, tmpX1, x1);
505 y1 = highway::ConcatLowerLower(tag_double, tmpY1, y1);
506 z1 = highway::ConcatLowerLower(tag_double, tmpZ1, z1);
507 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
508 const int index = reversed ? (remainder ? 0 : i - _vecLengthDouble / 2 + 1) : i;
509 const int lanes = remainder ? restI : _vecLengthDouble / 2;
510
511 ownedStateILong = highway::LoadN(tag_long, reinterpret_cast<const int64_t *>(&ownedStatePtr[index]), lanes);
512
513 x1 = highway::LoadN(tag_double, &xPtr[index], lanes);
514 y1 = highway::LoadN(tag_double, &yPtr[index], lanes);
515 z1 = highway::LoadN(tag_double, &zPtr[index], lanes);
516
517 ownedStateILong = highway::ConcatLowerLower(tag_long, ownedStateILong, ownedStateILong);
518 x1 = highway::ConcatLowerLower(tag_double, x1, x1);
519 y1 = highway::ConcatLowerLower(tag_double, y1, y1);
520 z1 = highway::ConcatLowerLower(tag_double, z1, z1);
521 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
522 const auto index = reversed ? (remainder ? 0 : i - _vecLengthDouble + 1) : i;
523
524 if constexpr (remainder) {
525 x1 = highway::LoadN(tag_double, &xPtr[index], restI);
526 y1 = highway::LoadN(tag_double, &yPtr[index], restI);
527 z1 = highway::LoadN(tag_double, &zPtr[index], restI);
528
529 ownedStateILong = highway::LoadN(tag_long, reinterpret_cast<const int64_t *>(&ownedStatePtr[index]), restI);
530 } else {
531 x1 = highway::LoadU(tag_double, &xPtr[index]);
532 y1 = highway::LoadU(tag_double, &yPtr[index]);
533 z1 = highway::LoadU(tag_double, &zPtr[index]);
534
535 ownedStateILong = highway::LoadU(tag_long, reinterpret_cast<const int64_t *>(&ownedStatePtr[index]));
536 }
537 }
538
539 MaskLong ownedMaskILong = highway::Ne(ownedStateILong, highway::Zero(tag_long));
540
541 // convert to a double mask since we perform logical operations with other double masks in the kernel.
542 ownedMaskI = highway::RebindMask(tag_double, ownedMaskILong);
543 }
544
545 template <bool remainder, VectorizationPattern vecPattern>
546 static void handleNewton3Reduction(const VectorDouble &fx, const VectorDouble &fy, const VectorDouble &fz,
547 double *const __restrict fx2Ptr, double *const __restrict fy2Ptr,
548 double *const __restrict fz2Ptr, const size_t j, const size_t rest) {
549 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
550 const VectorDouble fx2 =
551 remainder ? highway::LoadN(tag_double, &fx2Ptr[j], rest) : highway::LoadU(tag_double, &fx2Ptr[j]);
552 const VectorDouble fy2 =
553 remainder ? highway::LoadN(tag_double, &fy2Ptr[j], rest) : highway::LoadU(tag_double, &fy2Ptr[j]);
554 const VectorDouble fz2 =
555 remainder ? highway::LoadN(tag_double, &fz2Ptr[j], rest) : highway::LoadU(tag_double, &fz2Ptr[j]);
556
557 const VectorDouble fx2New = highway::Sub(fx2, fx);
558 const VectorDouble fy2New = highway::Sub(fy2, fy);
559 const VectorDouble fz2New = highway::Sub(fz2, fz);
560
561 remainder ? highway::StoreN(fx2New, tag_double, &fx2Ptr[j], rest)
562 : highway::StoreU(fx2New, tag_double, &fx2Ptr[j]);
563 remainder ? highway::StoreN(fy2New, tag_double, &fy2Ptr[j], rest)
564 : highway::StoreU(fy2New, tag_double, &fy2Ptr[j]);
565 remainder ? highway::StoreN(fz2New, tag_double, &fz2Ptr[j], rest)
566 : highway::StoreU(fz2New, tag_double, &fz2Ptr[j]);
567 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
568 const auto lowerFx = highway::LowerHalf(tag_double_half, fx);
569 const auto lowerFy = highway::LowerHalf(tag_double_half, fy);
570 const auto lowerFz = highway::LowerHalf(tag_double_half, fz);
571
572 const auto upperFx = highway::UpperHalf(tag_double_half, fx);
573 const auto upperFy = highway::UpperHalf(tag_double_half, fy);
574 const auto upperFz = highway::UpperHalf(tag_double_half, fz);
575
576 const auto fxCombined = highway::Add(lowerFx, upperFx);
577 const auto fyCombined = highway::Add(lowerFy, upperFy);
578 const auto fzCombined = highway::Add(lowerFz, upperFz);
579
580 const int lanes = remainder ? rest : _vecLengthDouble / 2;
581
582 const auto fx2 = highway::LoadN(tag_double_half, &fx2Ptr[j], lanes);
583 const auto fy2 = highway::LoadN(tag_double_half, &fy2Ptr[j], lanes);
584 const auto fz2 = highway::LoadN(tag_double_half, &fz2Ptr[j], lanes);
585
586 const auto newFx = highway::Sub(fx2, fxCombined);
587 const auto newFy = highway::Sub(fy2, fyCombined);
588 const auto newFz = highway::Sub(fz2, fzCombined);
589
590 highway::StoreN(newFx, tag_double_half, &fx2Ptr[j], lanes);
591 highway::StoreN(newFy, tag_double_half, &fy2Ptr[j], lanes);
592 highway::StoreN(newFz, tag_double_half, &fz2Ptr[j], lanes);
593 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
594 const auto lowerFx = highway::LowerHalf(tag_double_half, fx);
595 const auto lowerFy = highway::LowerHalf(tag_double_half, fy);
596 const auto lowerFz = highway::LowerHalf(tag_double_half, fz);
597
598 fx2Ptr[j] -= highway::ReduceSum(tag_double_half, lowerFx);
599 fy2Ptr[j] -= highway::ReduceSum(tag_double_half, lowerFy);
600 fz2Ptr[j] -= highway::ReduceSum(tag_double_half, lowerFz);
601
602 if constexpr (not remainder) {
603 const auto upperFx = highway::UpperHalf(tag_double_half, fx);
604 const auto upperFy = highway::UpperHalf(tag_double_half, fy);
605 const auto upperFz = highway::UpperHalf(tag_double_half, fz);
606
607 fx2Ptr[j + 1] -= highway::ReduceSum(tag_double_half, upperFx);
608 fy2Ptr[j + 1] -= highway::ReduceSum(tag_double_half, upperFy);
609 fz2Ptr[j + 1] -= highway::ReduceSum(tag_double_half, upperFz);
610 }
611 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
612 fx2Ptr[j] -= highway::ReduceSum(tag_double, fx);
613 fy2Ptr[j] -= highway::ReduceSum(tag_double, fy);
614 fz2Ptr[j] -= highway::ReduceSum(tag_double, fz);
615 }
616 }
617
618 template <bool reversed, bool remainder, VectorizationPattern vecPattern>
619 static void reduceAccumulatedForce(const size_t i, double *const __restrict fxPtr, double *const __restrict fyPtr,
620 double *const __restrict fzPtr, const VectorDouble &fxAcc,
621 const VectorDouble &fyAcc, const VectorDouble &fzAcc, const int restI) {
622 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
623 fxPtr[i] += highway::ReduceSum(tag_double, fxAcc);
624 fyPtr[i] += highway::ReduceSum(tag_double, fyAcc);
625 fzPtr[i] += highway::ReduceSum(tag_double, fzAcc);
626 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
627 const auto lowerFxAcc = highway::LowerHalf(tag_double_half, fxAcc);
628 const auto lowerFyAcc = highway::LowerHalf(tag_double_half, fyAcc);
629 const auto lowerFzAcc = highway::LowerHalf(tag_double_half, fzAcc);
630
631 fxPtr[i] += highway::ReduceSum(tag_double_half, lowerFxAcc);
632 fyPtr[i] += highway::ReduceSum(tag_double_half, lowerFyAcc);
633 fzPtr[i] += highway::ReduceSum(tag_double_half, lowerFzAcc);
634
635 if constexpr (not remainder) {
636 const auto upperFxAcc = highway::UpperHalf(tag_double_half, fxAcc);
637 const auto upperFyAcc = highway::UpperHalf(tag_double_half, fyAcc);
638 const auto upperFzAcc = highway::UpperHalf(tag_double_half, fzAcc);
639
640 const auto index = reversed ? i - 1 : i + 1;
641 fxPtr[index] += highway::ReduceSum(tag_double_half, upperFxAcc);
642 fyPtr[index] += highway::ReduceSum(tag_double_half, upperFyAcc);
643 fzPtr[index] += highway::ReduceSum(tag_double_half, upperFzAcc);
644 }
645 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
646 const auto lowerFxAcc = highway::LowerHalf(tag_double_half, fxAcc);
647 const auto lowerFyAcc = highway::LowerHalf(tag_double_half, fyAcc);
648 const auto lowerFzAcc = highway::LowerHalf(tag_double_half, fzAcc);
649
650 const auto upperFxAcc = highway::UpperHalf(tag_double_half, fxAcc);
651 const auto upperFyAcc = highway::UpperHalf(tag_double_half, fyAcc);
652 const auto upperFzAcc = highway::UpperHalf(tag_double_half, fzAcc);
653
654 const auto fxAccCombined = highway::Add(lowerFxAcc, upperFxAcc);
655 const auto fyAccCombined = highway::Add(lowerFyAcc, upperFyAcc);
656 const auto fzAccCombined = highway::Add(lowerFzAcc, upperFzAcc);
657
658 const int index = reversed ? (remainder ? 0 : i - _vecLengthDouble / 2 + 1) : i;
659
660 const int lanes = remainder ? restI : _vecLengthDouble / 2;
661
662 const auto oldFx = highway::LoadN(tag_double_half, &fxPtr[index], lanes);
663 const auto oldFy = highway::LoadN(tag_double_half, &fyPtr[index], lanes);
664 const auto oldFz = highway::LoadN(tag_double_half, &fzPtr[index], lanes);
665
666 const auto newFx = highway::Add(oldFx, fxAccCombined);
667 const auto newFy = highway::Add(oldFy, fyAccCombined);
668 const auto newFz = highway::Add(oldFz, fzAccCombined);
669
670 highway::StoreN(newFx, tag_double_half, &fxPtr[index], lanes);
671 highway::StoreN(newFy, tag_double_half, &fyPtr[index], lanes);
672 highway::StoreN(newFz, tag_double_half, &fzPtr[index], lanes);
673 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
674 const VectorDouble oldFx =
675 remainder ? highway::LoadN(tag_double, &fxPtr[i], restI) : highway::LoadU(tag_double, &fxPtr[i]);
676 const VectorDouble oldFy =
677 remainder ? highway::LoadN(tag_double, &fyPtr[i], restI) : highway::LoadU(tag_double, &fyPtr[i]);
678 const VectorDouble oldFz =
679 remainder ? highway::LoadN(tag_double, &fzPtr[i], restI) : highway::LoadU(tag_double, &fzPtr[i]);
680
681 const VectorDouble fxNew = highway::Add(oldFx, fxAcc);
682 const VectorDouble fyNew = highway::Add(oldFy, fyAcc);
683 const VectorDouble fzNew = highway::Add(oldFz, fzAcc);
684
685 remainder ? highway::StoreN(fxNew, tag_double, &fxPtr[i], restI) : highway::StoreU(fxNew, tag_double, &fxPtr[i]);
686 remainder ? highway::StoreN(fyNew, tag_double, &fyPtr[i], restI) : highway::StoreU(fyNew, tag_double, &fyPtr[i]);
687 remainder ? highway::StoreN(fzNew, tag_double, &fzPtr[i], restI) : highway::StoreU(fzNew, tag_double, &fzPtr[i]);
688 }
689 }
690
691 inline void computeGlobals(const VectorDouble &virialSumX, const VectorDouble &virialSumY,
692 const VectorDouble &virialSumZ, const VectorDouble &uPotSum) {
693 const int threadnum = autopas::autopas_get_thread_num();
694
695 _aosThreadData[threadnum].virialSum[0] += highway::ReduceSum(tag_double, virialSumX);
696 _aosThreadData[threadnum].virialSum[1] += highway::ReduceSum(tag_double, virialSumY);
697 _aosThreadData[threadnum].virialSum[2] += highway::ReduceSum(tag_double, virialSumZ);
698 _aosThreadData[threadnum].potentialEnergySum += highway::ReduceSum(tag_double, uPotSum);
699 }
700
734 template <bool reversed, bool newton3, bool remainderI, VectorizationPattern vecPattern>
735 inline void handleILoopBody(
736 const size_t i, const double *const __restrict xPtr1, const double *const __restrict yPtr1,
737 const double *const __restrict zPtr1, const autopas::OwnershipState *const __restrict ownedStatePtr1,
738 const double *const __restrict xPtr2, const double *const __restrict yPtr2, const double *const __restrict zPtr2,
739 const autopas::OwnershipState *const __restrict ownedStatePtr2, double *const __restrict fxPtr1,
740 double *const __restrict fyPtr1, double *const __restrict fzPtr1, double *const __restrict fxPtr2,
741 double *const __restrict fyPtr2, double *const __restrict fzPtr2, const size_t *const __restrict typeIDptr1,
742 const size_t *const __restrict typeIDptr2, VectorDouble &virialSumX, VectorDouble &virialSumY,
743 VectorDouble &virialSumZ, VectorDouble &uPotSum, const size_t restI, const size_t jVecEnd) {
744 VectorDouble fxAcc = highway::Zero(tag_double);
745 VectorDouble fyAcc = highway::Zero(tag_double);
746 VectorDouble fzAcc = highway::Zero(tag_double);
747
748 MaskDouble ownedMaskI;
749
750 VectorDouble x1 = highway::Zero(tag_double);
751 VectorDouble y1 = highway::Zero(tag_double);
752 VectorDouble z1 = highway::Zero(tag_double);
753
754 fillIRegisters<remainderI, reversed, vecPattern>(i, xPtr1, yPtr1, zPtr1, ownedStatePtr1, x1, y1, z1, ownedMaskI,
755 restI);
756
757 std::ptrdiff_t j = 0;
758 for (; checkSecondLoopCondition<vecPattern>(jVecEnd, j); incrementSecondLoop<vecPattern>(j)) {
759 SoAKernel<newton3, remainderI, false, reversed, vecPattern>(
760 i, j, ownedMaskI, reinterpret_cast<const int64_t *>(ownedStatePtr2), x1, y1, z1, xPtr2, yPtr2, zPtr2, fxPtr2,
761 fyPtr2, fzPtr2, &typeIDptr1[i], &typeIDptr2[j], fxAcc, fyAcc, fzAcc, virialSumX, virialSumY, virialSumZ,
762 uPotSum, restI, 0);
763 }
764
765 const int restJ = obtainJLoopRemainderLength<vecPattern>(jVecEnd);
766 if (restJ > 0) {
767 SoAKernel<newton3, remainderI, true, reversed, vecPattern>(
768 i, j, ownedMaskI, reinterpret_cast<const int64_t *>(ownedStatePtr2), x1, y1, z1, xPtr2, yPtr2, zPtr2, fxPtr2,
769 fyPtr2, fzPtr2, &typeIDptr1[i], &typeIDptr2[j], fxAcc, fyAcc, fzAcc, virialSumX, virialSumY, virialSumZ,
770 uPotSum, restI, restJ);
771 }
772
773 reduceAccumulatedForce<reversed, remainderI, vecPattern>(i, fxPtr1, fyPtr1, fzPtr1, fxAcc, fyAcc, fzAcc, restI);
774 }
775
783 template <bool newton3, VectorizationPattern vecPattern>
784 inline void SoAFunctorPairImpl(autopas::SoAView<SoAArraysType> soa1, autopas::SoAView<SoAArraysType> soa2) {
785 if (soa1.size() == 0 || soa2.size() == 0) {
786 return;
787 }
788
789 const auto *const __restrict x1Ptr = soa1.template begin<Particle_T::AttributeNames::posX>();
790 const auto *const __restrict y1Ptr = soa1.template begin<Particle_T::AttributeNames::posY>();
791 const auto *const __restrict z1Ptr = soa1.template begin<Particle_T::AttributeNames::posZ>();
792 const auto *const __restrict x2Ptr = soa2.template begin<Particle_T::AttributeNames::posX>();
793 const auto *const __restrict y2Ptr = soa2.template begin<Particle_T::AttributeNames::posY>();
794 const auto *const __restrict z2Ptr = soa2.template begin<Particle_T::AttributeNames::posZ>();
795
796 const auto *const __restrict ownedStatePtr1 = soa1.template begin<Particle_T::AttributeNames::ownershipState>();
797 const auto *const __restrict ownedStatePtr2 = soa2.template begin<Particle_T::AttributeNames::ownershipState>();
798
799 auto *const __restrict fx1Ptr = soa1.template begin<Particle_T::AttributeNames::forceX>();
800 auto *const __restrict fy1Ptr = soa1.template begin<Particle_T::AttributeNames::forceY>();
801 auto *const __restrict fz1Ptr = soa1.template begin<Particle_T::AttributeNames::forceZ>();
802 auto *const __restrict fx2Ptr = soa2.template begin<Particle_T::AttributeNames::forceX>();
803 auto *const __restrict fy2Ptr = soa2.template begin<Particle_T::AttributeNames::forceY>();
804 auto *const __restrict fz2Ptr = soa2.template begin<Particle_T::AttributeNames::forceZ>();
805
806 const auto *const __restrict typeID1ptr = soa1.template begin<Particle_T::AttributeNames::typeId>();
807 const auto *const __restrict typeID2ptr = soa2.template begin<Particle_T::AttributeNames::typeId>();
808
809 VectorDouble virialSumX = highway::Zero(tag_double);
810 VectorDouble virialSumY = highway::Zero(tag_double);
811 VectorDouble virialSumZ = highway::Zero(tag_double);
812 VectorDouble uPotSum = highway::Zero(tag_double);
813
814 std::ptrdiff_t i = 0;
815 for (; checkFirstLoopCondition<false, vecPattern>(i, soa1.size()); incrementFirstLoop<vecPattern>(i)) {
816 handleILoopBody<false, newton3, false, vecPattern>(
817 i, x1Ptr, y1Ptr, z1Ptr, ownedStatePtr1, x2Ptr, y2Ptr, z2Ptr, ownedStatePtr2, fx1Ptr, fy1Ptr, fz1Ptr, fx2Ptr,
818 fy2Ptr, fz2Ptr, typeID1ptr, typeID2ptr, virialSumX, virialSumY, virialSumZ, uPotSum, 0, soa2.size());
819 }
820
821 if constexpr (vecPattern != VectorizationPattern::p1xVec) {
822 // Rest I can't occur in 1xVec case
823 const int restI = obtainILoopRemainderLength<false>(i, soa1.size());
824 if (restI > 0) {
825 handleILoopBody<false, newton3, true, vecPattern>(
826 i, x1Ptr, y1Ptr, z1Ptr, ownedStatePtr1, x2Ptr, y2Ptr, z2Ptr, ownedStatePtr2, fx1Ptr, fy1Ptr, fz1Ptr, fx2Ptr,
827 fy2Ptr, fz2Ptr, typeID1ptr, typeID2ptr, virialSumX, virialSumY, virialSumZ, uPotSum, restI, soa2.size());
828 }
829 }
830
831 if constexpr (calculateGlobals) {
832 computeGlobals(virialSumX, virialSumY, virialSumZ, uPotSum);
833 }
834 }
835
853 template <bool remainder, VectorizationPattern vecPattern>
854 static void fillJRegisters(const size_t j, const double *const __restrict x2Ptr, const double *const __restrict y2Ptr,
855 const double *const __restrict z2Ptr, const int64_t *const __restrict ownedStatePtr2,
856 VectorDouble &x2, VectorDouble &y2, VectorDouble &z2, MaskDouble &ownedMaskJ,
857 const unsigned int rest) {
858 VectorLong ownedStateJLong = highway::Zero(tag_long);
859
860 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
861 if constexpr (remainder) {
862 x2 = highway::LoadN(tag_double, &x2Ptr[j], rest);
863 y2 = highway::LoadN(tag_double, &y2Ptr[j], rest);
864 z2 = highway::LoadN(tag_double, &z2Ptr[j], rest);
865
866 ownedStateJLong = highway::LoadN(tag_long, &ownedStatePtr2[j], rest);
867 } else {
868 x2 = highway::LoadU(tag_double, &x2Ptr[j]);
869 y2 = highway::LoadU(tag_double, &y2Ptr[j]);
870 z2 = highway::LoadU(tag_double, &z2Ptr[j]);
871
872 ownedStateJLong = highway::LoadU(tag_long, &ownedStatePtr2[j]);
873 }
874 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
875 const int lanes = remainder ? rest : _vecLengthDouble / 2;
876
877 VectorLong ownedStateJ = highway::LoadN(tag_long, &ownedStatePtr2[j], lanes);
878 x2 = highway::LoadN(tag_double, &x2Ptr[j], lanes);
879 y2 = highway::LoadN(tag_double, &y2Ptr[j], lanes);
880 z2 = highway::LoadN(tag_double, &z2Ptr[j], lanes);
881
882 // "broadcast" lower half to upper half
883 ownedStateJLong = highway::ConcatLowerLower(tag_long, ownedStateJ, ownedStateJ);
884 x2 = highway::ConcatLowerLower(tag_double, x2, x2);
885 y2 = highway::ConcatLowerLower(tag_double, y2, y2);
886 z2 = highway::ConcatLowerLower(tag_double, z2, z2);
887 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
888 VectorLong ownedStateJ = highway::Set(tag_long, ownedStatePtr2[j]);
889 x2 = highway::Set(tag_double, x2Ptr[j]);
890 y2 = highway::Set(tag_double, y2Ptr[j]);
891 z2 = highway::Set(tag_double, z2Ptr[j]);
892
893 if constexpr (remainder) {
894 ownedStateJLong = highway::ConcatLowerLower(tag_long, highway::Zero(tag_long), ownedStateJ);
895 x2 = highway::ConcatLowerLower(tag_double, highway::Zero(tag_double), x2);
896 y2 = highway::ConcatLowerLower(tag_double, highway::Zero(tag_double), y2);
897 z2 = highway::ConcatLowerLower(tag_double, highway::Zero(tag_double), z2);
898 } else {
899 const auto tmpOwnedJ = highway::Set(tag_long, ownedStatePtr2[j + 1]);
900 const auto tmpX2 = highway::Set(tag_double, x2Ptr[j + 1]);
901 const auto tmpY2 = highway::Set(tag_double, y2Ptr[j + 1]);
902 const auto tmpZ2 = highway::Set(tag_double, z2Ptr[j + 1]);
903
904 ownedStateJLong = highway::ConcatLowerLower(tag_long, tmpOwnedJ, ownedStateJ);
905 x2 = highway::ConcatLowerLower(tag_double, tmpX2, x2);
906 y2 = highway::ConcatLowerLower(tag_double, tmpY2, y2);
907 z2 = highway::ConcatLowerLower(tag_double, tmpZ2, z2);
908 }
909 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
910 ownedStateJLong = highway::Set(tag_long, ownedStatePtr2[j]);
911 x2 = highway::Set(tag_double, x2Ptr[j]);
912 y2 = highway::Set(tag_double, y2Ptr[j]);
913 z2 = highway::Set(tag_double, z2Ptr[j]);
914 }
915
916 MaskLong ownedMaskJLong = highway::Ne(ownedStateJLong, highway::Zero(tag_long));
917
918 // convert to a double mask since we perform logical operations with other double masks in the kernel.
919 ownedMaskJ = highway::RebindMask(tag_double, ownedMaskJLong);
920 }
921
922 template <bool remainderI, bool remainderJ, bool reversed, VectorizationPattern vecPattern>
923 inline void fillPhysicsRegisters(const size_t *const typeID1Ptr, const size_t *const typeID2Ptr,
924 VectorDouble &epsilon24s, VectorDouble &sigmaSquareds, VectorDouble &shift6s,
925 const unsigned int restI, const unsigned int restJ) const {
926 // We overestimate the array size. This should be a tight/perfect upper bound on x86, and should never be large
927 // enough on e.g. ARM/RISC-V to cause issues.
928 HWY_ALIGN std::array<double, _maxVecLengthDouble> epsilons{};
929 HWY_ALIGN std::array<double, _maxVecLengthDouble> sigmas{};
930 HWY_ALIGN std::array<double, _maxVecLengthDouble> shifts{};
931
932 if constexpr (vecPattern == VectorizationPattern::p1xVec) {
933 for (int j = 0; j < (remainderJ ? restJ : _vecLengthDouble); ++j) {
934 epsilons[j] = _PPLibrary->get().getMixing24Epsilon(*typeID1Ptr, *(typeID2Ptr + j));
935 sigmas[j] = _PPLibrary->get().getMixingSigmaSquared(*typeID1Ptr, *(typeID2Ptr + j));
936 if constexpr (applyShift) {
937 shifts[j] = _PPLibrary->get().getMixingShift6(*typeID1Ptr, *(typeID2Ptr + j));
938 }
939 }
940 } else if constexpr (vecPattern == VectorizationPattern::p2xVecDiv2) {
941 for (int i = 0; i < (remainderI ? 1 : 2); ++i) {
942 for (int j = 0; j < (remainderJ ? restJ : _vecLengthDouble / 2); ++j) {
943 const auto index = i * (_vecLengthDouble / 2) + j;
944 const auto typeID1 = reversed ? typeID1Ptr - i : typeID1Ptr + i;
945 epsilons[index] = _PPLibrary->get().getMixing24Epsilon(*typeID1, *(typeID2Ptr + j));
946 sigmas[index] = _PPLibrary->get().getMixingSigmaSquared(*typeID1, *(typeID2Ptr + j));
947
948 if constexpr (applyShift) {
949 shifts[index] = _PPLibrary->get().getMixingShift6(*typeID1, *(typeID2Ptr + j));
950 }
951 }
952 }
953 } else if constexpr (vecPattern == VectorizationPattern::pVecDiv2x2) {
954 for (int i = 0; i < (remainderI ? restI : _vecLengthDouble / 2); ++i) {
955 for (int j = 0; j < (remainderJ ? 1 : 2); ++j) {
956 const auto index = i + _vecLengthDouble / 2 * j;
957 const auto typeID1 = reversed ? typeID1Ptr - i : typeID1Ptr + i;
958
959 epsilons[index] = _PPLibrary->get().getMixing24Epsilon(*typeID1, *(typeID2Ptr + j));
960 sigmas[index] = _PPLibrary->get().getMixingSigmaSquared(*typeID1, *(typeID2Ptr + j));
961
962 if constexpr (applyShift) {
963 shifts[index] = _PPLibrary->get().getMixingShift6(*typeID1, *(typeID2Ptr + j));
964 }
965 }
966 }
967 } else if constexpr (vecPattern == VectorizationPattern::pVecx1) {
968 for (int i = 0; i < (remainderI ? restI : _vecLengthDouble); ++i) {
969 auto typeID1 = reversed ? typeID1Ptr - i : typeID1Ptr + i;
970 epsilons[i] = _PPLibrary->get().getMixing24Epsilon(*typeID1, *typeID2Ptr);
971 sigmas[i] = _PPLibrary->get().getMixingSigmaSquared(*typeID1, *typeID2Ptr);
972
973 if constexpr (applyShift) {
974 shifts[i] = _PPLibrary->get().getMixingShift6(*typeID1, *typeID2Ptr);
975 }
976 }
977 }
978
979 epsilon24s = highway::Load(tag_double, epsilons.data());
980 sigmaSquareds = highway::Load(tag_double, sigmas.data());
981 if constexpr (applyShift) {
982 shift6s = highway::Load(tag_double, shifts.data());
983 }
984 }
985
1019 template <bool newton3, bool remainderI, bool remainderJ, bool reversed, VectorizationPattern vecPattern>
1020 inline void SoAKernel(const size_t i, const size_t j, const MaskDouble &ownedMaskI,
1021 const int64_t *const __restrict ownedStatePtr2, const VectorDouble &x1, const VectorDouble &y1,
1022 const VectorDouble &z1, const double *const __restrict x2Ptr,
1023 const double *const __restrict y2Ptr, const double *const __restrict z2Ptr,
1024 double *const __restrict fx2Ptr, double *const __restrict fy2Ptr,
1025 double *const __restrict fz2Ptr, const size_t *const typeID1Ptr, const size_t *const typeID2Ptr,
1026 VectorDouble &fxAcc, VectorDouble &fyAcc, VectorDouble &fzAcc, VectorDouble &virialSumX,
1027 VectorDouble &virialSumY, VectorDouble &virialSumZ, VectorDouble &uPotSum,
1028 const unsigned int restI, const unsigned int restJ) {
1029 VectorDouble epsilon24s = highway::Undefined(tag_double);
1030 VectorDouble sigmaSquareds = highway::Undefined(tag_double);
1031 VectorDouble shift6s = highway::Undefined(tag_double);
1032
1033 if constexpr (useMixing) {
1034 fillPhysicsRegisters<remainderI, remainderJ, reversed, vecPattern>(typeID1Ptr, typeID2Ptr, epsilon24s,
1035 sigmaSquareds, shift6s, restI, restJ);
1036 } else {
1037 epsilon24s = highway::Set(tag_double, _epsilon24AoS);
1038 sigmaSquareds = highway::Set(tag_double, _sigmaSquareAoS);
1039 if constexpr (applyShift) {
1040 shift6s = highway::Set(tag_double, _shift6AoS);
1041 } else {
1042 shift6s = highway::Zero(tag_double);
1043 }
1044 }
1045
1046 VectorDouble x2;
1047 VectorDouble y2;
1048 VectorDouble z2;
1049 MaskDouble ownedMaskJ;
1050
1051 fillJRegisters<remainderJ, vecPattern>(j, x2Ptr, y2Ptr, z2Ptr, ownedStatePtr2, x2, y2, z2, ownedMaskJ, restJ);
1052
1053 // distance calculations
1054 const auto drX = highway::Sub(x1, x2);
1055 const auto drY = highway::Sub(y1, y2);
1056 const auto drZ = highway::Sub(z1, z2);
1057
1058 const auto drX2 = highway::Mul(drX, drX);
1059 const auto drY2 = highway::Mul(drY, drY);
1060 const auto drZ2 = highway::Mul(drZ, drZ);
1061
1062 const auto dr2 = highway::Add(highway::Add(drX2, drY2), drZ2);
1063
1064 VectorDouble cutoffSquared = highway::Set(tag_double, _cutoffSquareAoS);
1065
1066 const auto dummyMask = highway::And(ownedMaskI, ownedMaskJ);
1067 const auto cutoffDummyMask = highway::MaskedLe(dummyMask, dr2, cutoffSquared);
1068
1069 if (highway::AllFalse(tag_double, cutoffDummyMask)) {
1070 return;
1071 }
1072
1073 // compute LJ Potential
1074 const auto invDr2 = highway::Div(highway::Set(tag_double, 1.0), dr2);
1075 const auto lj2 = highway::Mul(sigmaSquareds, invDr2);
1076 const auto lj4 = highway::Mul(lj2, lj2);
1077 const auto lj6 = highway::Mul(lj2, lj4);
1078 const auto lj12 = highway::Mul(lj6, lj6);
1079 const auto lj12m6 = highway::Sub(lj12, lj6);
1080 const auto lj12m6alj12 = highway::Add(lj12m6, lj12);
1081 const auto lj12m6alj12e = highway::Mul(lj12m6alj12, epsilon24s);
1082 const auto fac = highway::Mul(lj12m6alj12e, invDr2);
1083
1084 const auto facMasked = highway::IfThenElseZero(cutoffDummyMask, fac);
1085
1086 const VectorDouble fx = highway::Mul(drX, facMasked);
1087 const VectorDouble fy = highway::Mul(drY, facMasked);
1088 const VectorDouble fz = highway::Mul(drZ, facMasked);
1089
1090 fxAcc = highway::Add(fxAcc, fx);
1091 fyAcc = highway::Add(fyAcc, fy);
1092 fzAcc = highway::Add(fzAcc, fz);
1093
1094 if constexpr (newton3) {
1095 handleNewton3Reduction<remainderJ, vecPattern>(fx, fy, fz, fx2Ptr, fy2Ptr, fz2Ptr, j, restJ);
1096 }
1097
1098 if constexpr (calculateGlobals) {
1099 auto virialX = highway::Mul(fx, drX);
1100 auto virialY = highway::Mul(fy, drY);
1101 auto virialZ = highway::Mul(fz, drZ);
1102
1103 auto uPot = highway::MulAdd(epsilon24s, lj12m6, shift6s);
1104 auto uPotMasked = highway::IfThenElseZero(cutoffDummyMask, uPot);
1105
1106 auto energyFactor = highway::MaskedSet(tag_double, dummyMask, 1.0);
1107
1108 if constexpr (newton3) {
1109 energyFactor = highway::Add(energyFactor, highway::MaskedSet(tag_double, dummyMask, 1.0));
1110 }
1111
1112 uPotSum = highway::MulAdd(energyFactor, uPotMasked, uPotSum);
1113 virialSumX = highway::MulAdd(energyFactor, virialX, virialSumX);
1114 virialSumY = highway::MulAdd(energyFactor, virialY, virialSumY);
1115 virialSumZ = highway::MulAdd(energyFactor, virialZ, virialSumZ);
1116 }
1117 }
1118
1119 public:
1120 // clang-format off
1126 // clang-format on
1127 inline void SoAFunctorVerlet(autopas::SoAView<SoAArraysType> soa, const size_t indexFirst,
1128 const std::vector<size_t, autopas::AlignedAllocator<size_t>> &neighborList,
1129 bool newton3) final {
1130 if (soa.size() == 0 or neighborList.empty()) return;
1131 if (newton3) {
1132 SoAFunctorVerletImpl<true>(soa, indexFirst, neighborList);
1133 } else {
1134 SoAFunctorVerletImpl<false>(soa, indexFirst, neighborList);
1135 }
1136 }
1137
1138 private:
1139 template <bool newton3>
1140 inline void SoAFunctorVerletImpl(autopas::SoAView<SoAArraysType> soa, const size_t indexFirst,
1141 const std::vector<size_t, autopas::AlignedAllocator<size_t>> &neighborList) {
1142 const auto *const __restrict ownedStatePtr = soa.template begin<Particle_T::AttributeNames::ownershipState>();
1143 if (ownedStatePtr[indexFirst] == autopas::OwnershipState::dummy) {
1144 return;
1145 }
1146
1147 const auto *const __restrict xPtr = soa.template begin<Particle_T::AttributeNames::posX>();
1148 const auto *const __restrict yPtr = soa.template begin<Particle_T::AttributeNames::posY>();
1149 const auto *const __restrict zPtr = soa.template begin<Particle_T::AttributeNames::posZ>();
1150
1151 auto *const __restrict fxPtr = soa.template begin<Particle_T::AttributeNames::forceX>();
1152 auto *const __restrict fyPtr = soa.template begin<Particle_T::AttributeNames::forceY>();
1153 auto *const __restrict fzPtr = soa.template begin<Particle_T::AttributeNames::forceZ>();
1154
1155 const auto *const __restrict typeIDPtr = soa.template begin<Particle_T::AttributeNames::typeId>();
1156
1157 VectorDouble virialSumX = highway::Zero(tag_double);
1158 VectorDouble virialSumY = highway::Zero(tag_double);
1159 VectorDouble virialSumZ = highway::Zero(tag_double);
1160 VectorDouble uPotSum = highway::Zero(tag_double);
1161 VectorDouble fxAcc = highway::Zero(tag_double);
1162 VectorDouble fyAcc = highway::Zero(tag_double);
1163 VectorDouble fzAcc = highway::Zero(tag_double);
1164
1165 const VectorDouble x1 = highway::Set(tag_double, xPtr[indexFirst]);
1166 const VectorDouble y1 = highway::Set(tag_double, yPtr[indexFirst]);
1167 const VectorDouble z1 = highway::Set(tag_double, zPtr[indexFirst]);
1168 const auto ownedI = static_cast<int64_t>(ownedStatePtr[indexFirst]);
1169 const VectorDouble ownedStateI = highway::Set(tag_double, static_cast<double>(ownedI));
1170 const MaskDouble ownedMaskI = highway::Ne(ownedStateI, highway::Zero(tag_double));
1171
1172 // We overestimate the array size. This should be a tight/perfect upper bound on x86, and should never be large
1173 // enough on e.g. ARM/RISC-V to cause issues.
1174 HWY_ALIGN std::array<double, _maxVecLengthDouble> x2Tmp{};
1175 HWY_ALIGN std::array<double, _maxVecLengthDouble> y2Tmp{};
1176 HWY_ALIGN std::array<double, _maxVecLengthDouble> z2Tmp{};
1177 HWY_ALIGN std::array<double, _maxVecLengthDouble> fx2Tmp{};
1178 HWY_ALIGN std::array<double, _maxVecLengthDouble> fy2Tmp{};
1179 HWY_ALIGN std::array<double, _maxVecLengthDouble> fz2Tmp{};
1180 HWY_ALIGN std::array<size_t, _maxVecLengthDouble> typeID2Tmp{};
1181 // ownedStates2Tmp is int64_t because we can directly use static_cast on the individual elements below, unlike other
1182 // functors where we have to resort to unsafe reinterpret_cast
1183 HWY_ALIGN std::array<int64_t, _maxVecLengthDouble> ownedStates2Tmp;
1184 ownedStates2Tmp.fill(static_cast<int64_t>(autopas::OwnershipState::dummy));
1185
1186 size_t j = 0;
1187 const size_t vecEnd = (neighborList.size() / _vecLengthDouble) * _vecLengthDouble;
1188
1189 for (; j < vecEnd; j += _vecLengthDouble) {
1190 // load neighbor particles in consecutive array
1191 for (long vecIndex = 0; vecIndex < _vecLengthDouble; ++vecIndex) {
1192 x2Tmp[vecIndex] = xPtr[neighborList[j + vecIndex]];
1193 y2Tmp[vecIndex] = yPtr[neighborList[j + vecIndex]];
1194 z2Tmp[vecIndex] = zPtr[neighborList[j + vecIndex]];
1195 if constexpr (newton3) {
1196 fx2Tmp[vecIndex] = fxPtr[neighborList[j + vecIndex]];
1197 fy2Tmp[vecIndex] = fyPtr[neighborList[j + vecIndex]];
1198 fz2Tmp[vecIndex] = fzPtr[neighborList[j + vecIndex]];
1199 }
1200 typeID2Tmp[vecIndex] = typeIDPtr[neighborList[j + vecIndex]];
1201 const auto ownedState = ownedStatePtr[neighborList[j + vecIndex]];
1202 ownedStates2Tmp[vecIndex] = static_cast<int64_t>(ownedState);
1203 }
1204
1205 SoAKernel<newton3, false, false, false, VectorizationPattern::p1xVec>(
1206 0, 0, ownedMaskI, ownedStates2Tmp.data(), x1, y1, z1, x2Tmp.data(), y2Tmp.data(), z2Tmp.data(), fx2Tmp.data(),
1207 fy2Tmp.data(), fz2Tmp.data(), &typeIDPtr[indexFirst], typeID2Tmp.data(), fxAcc, fyAcc, fzAcc, virialSumX,
1208 virialSumY, virialSumZ, uPotSum, 0, 0);
1209
1210 if constexpr (newton3) {
1211 for (size_t vecIndex = 0; vecIndex < _vecLengthDouble; ++vecIndex) {
1212 fxPtr[neighborList[j + vecIndex]] = fx2Tmp[vecIndex];
1213 fyPtr[neighborList[j + vecIndex]] = fy2Tmp[vecIndex];
1214 fzPtr[neighborList[j + vecIndex]] = fz2Tmp[vecIndex];
1215 }
1216 }
1217 }
1218
1219 const int rest = static_cast<int>(neighborList.size() & (_vecLengthDouble - 1));
1220
1221 if (rest > 0) {
1222 for (size_t vecIndex = 0; vecIndex < rest; ++vecIndex) {
1223 x2Tmp[vecIndex] = xPtr[neighborList[j + vecIndex]];
1224 y2Tmp[vecIndex] = yPtr[neighborList[j + vecIndex]];
1225 z2Tmp[vecIndex] = zPtr[neighborList[j + vecIndex]];
1226 if constexpr (newton3) {
1227 fx2Tmp[vecIndex] = fxPtr[neighborList[j + vecIndex]];
1228 fy2Tmp[vecIndex] = fyPtr[neighborList[j + vecIndex]];
1229 fz2Tmp[vecIndex] = fzPtr[neighborList[j + vecIndex]];
1230 }
1231 typeID2Tmp[vecIndex] = typeIDPtr[neighborList[j + vecIndex]];
1232 const auto ownedState = ownedStatePtr[neighborList[j + vecIndex]];
1233 ownedStates2Tmp[vecIndex] = static_cast<int64_t>(ownedState);
1234 }
1235
1236 SoAKernel<newton3, false, true, false, VectorizationPattern::p1xVec>(
1237 0, 0, ownedMaskI, ownedStates2Tmp.data(), x1, y1, z1, x2Tmp.data(), y2Tmp.data(), z2Tmp.data(), fx2Tmp.data(),
1238 fy2Tmp.data(), fz2Tmp.data(), &typeIDPtr[indexFirst], typeID2Tmp.data(), fxAcc, fyAcc, fzAcc, virialSumX,
1239 virialSumY, virialSumZ, uPotSum, 0, rest);
1240
1241 if constexpr (newton3) {
1242 for (long vecIndex = 0; vecIndex < _vecLengthDouble && vecIndex < rest; ++vecIndex) {
1243 fxPtr[neighborList[j + vecIndex]] = fx2Tmp[vecIndex];
1244 fyPtr[neighborList[j + vecIndex]] = fy2Tmp[vecIndex];
1245 fzPtr[neighborList[j + vecIndex]] = fz2Tmp[vecIndex];
1246 }
1247 }
1248 }
1249
1250 fxPtr[indexFirst] += highway::ReduceSum(tag_double, fxAcc);
1251 fyPtr[indexFirst] += highway::ReduceSum(tag_double, fyAcc);
1252 fzPtr[indexFirst] += highway::ReduceSum(tag_double, fzAcc);
1253
1254 if constexpr (calculateGlobals) {
1255 computeGlobals(virialSumX, virialSumY, virialSumZ, uPotSum);
1256 }
1257 }
1258
1259 public:
1263 constexpr static auto getNeededAttr() {
1264 return std::array<typename Particle_T::AttributeNames, 9>{Particle_T::AttributeNames::id,
1265 Particle_T::AttributeNames::posX,
1266 Particle_T::AttributeNames::posY,
1267 Particle_T::AttributeNames::posZ,
1268 Particle_T::AttributeNames::forceX,
1269 Particle_T::AttributeNames::forceY,
1270 Particle_T::AttributeNames::forceZ,
1271 Particle_T::AttributeNames::typeId,
1272 Particle_T::AttributeNames::ownershipState};
1273 }
1274
1278 constexpr static auto getNeededAttr(std::false_type) {
1279 return std::array<typename Particle_T::AttributeNames, 6>{
1280 Particle_T::AttributeNames::id, Particle_T::AttributeNames::posX,
1281 Particle_T::AttributeNames::posY, Particle_T::AttributeNames::posZ,
1282 Particle_T::AttributeNames::typeId, Particle_T::AttributeNames::ownershipState};
1283 }
1284
1288 constexpr static auto getComputedAttr() {
1289 return std::array<typename Particle_T::AttributeNames, 3>{
1290 Particle_T::AttributeNames::forceX, Particle_T::AttributeNames::forceY, Particle_T::AttributeNames::forceZ};
1291 }
1292
1297 constexpr static bool getMixing() { return useMixing; }
1298
1303 void initTraversal() final {
1304 _potentialEnergySum = 0.;
1305 _virialSum = {0., 0., 0.};
1306 _postProcessed = false;
1307 for (size_t i = 0; i < _aosThreadData.size(); ++i) {
1308 _aosThreadData[i].setZero();
1309 }
1310 }
1311
1316 void endTraversal(const bool newton3) final {
1317 using namespace autopas::utils::ArrayMath::literals;
1318
1319 if (_postProcessed) {
1321 "Already postprocessed, endTraversal(bool newton3) was called twice without calling initTraversal().");
1322 }
1323
1324 if (calculateGlobals) {
1325 for (size_t i = 0; i < _aosThreadData.size(); ++i) {
1326 _potentialEnergySum += _aosThreadData[i].potentialEnergySum;
1327 _virialSum += _aosThreadData[i].virialSum;
1328 }
1329 // For each interaction, we added the full contribution for both particles. Divide by 2 here, so that each
1330 // contribution is only counted once per pair.
1331 _potentialEnergySum *= 0.5;
1332 _virialSum *= 0.5;
1333
1334 // We have always calculated 6*potentialEnergy, so we divide by 6 here!
1335 _potentialEnergySum /= 6.;
1336 _postProcessed = true;
1337
1338 AutoPasLog(DEBUG, "Final potential energy {}", _potentialEnergySum);
1339 AutoPasLog(DEBUG, "Final virial {}", _virialSum[0] + _virialSum[1] + _virialSum[2]);
1340 }
1341 }
1342
1347 double getPotentialEnergy() const {
1348 if (not calculateGlobals) {
1350 "Trying to get upot even though calculateGlobals is false. If you want this functor to calculate global "
1351 "values, please specify calculateGlobals to be true.");
1352 }
1353 if (not _postProcessed) {
1354 throw autopas::utils::ExceptionHandler::AutoPasException("Cannot get upot, because endTraversal was not called.");
1355 }
1356 return _potentialEnergySum;
1357 }
1358
1363 double getVirial() const {
1364 if (not calculateGlobals) {
1366 "Trying to get virial even though calculateGlobals is false. If you want this functor to calculate global "
1367 "values, please specify calculateGlobals to be true.");
1368 }
1369 if (not _postProcessed) {
1371 "Cannot get virial, because endTraversal was not called.");
1372 }
1373 return _virialSum[0] + _virialSum[1] + _virialSum[2];
1374 }
1375
1384 void setParticleProperties(const double epsilon24, const double sigmaSquare) {
1385 _epsilon24AoS = epsilon24;
1386 _sigmaSquareAoS = sigmaSquare;
1387 if constexpr (applyShift) {
1388 _shift6AoS = ParticlePropertiesLibrary<double, size_t>::calcShift6(epsilon24, sigmaSquare, _cutoffSquareAoS);
1389 } else {
1390 _shift6AoS = 0.;
1391 }
1392 }
1393
1397 void setVecPattern(const VectorizationPattern vecPattern) final { _vecPattern = vecPattern; }
1398
1399 private:
1403 class AoSThreadData {
1404 public:
1405 AoSThreadData() : virialSum{0., 0., 0.}, potentialEnergySum{0.} {}
1406
1407 void setZero() {
1408 virialSum = {0., 0., 0.};
1409 potentialEnergySum = 0.;
1410 }
1411
1412 std::array<double, 3> virialSum{};
1413 double potentialEnergySum{};
1414
1415 private:
1416 double __remainingTo64[(64 - 4 * sizeof(double)) / sizeof(double)];
1417 };
1418 static_assert(sizeof(AoSThreadData) % 64 == 0, "AoSThreadData has wrong size");
1419
1420 // cutoff squared used in the AoS functor.
1421 const double _cutoffSquareAoS{0.};
1422 // epsilon, sigma and shift6 used in the AoS functor.
1423 double _epsilon24AoS{0.}, _sigmaSquareAoS{0.}, _shift6AoS{0.};
1424
1425 // optional to hold a reference to the ParticlePropertiesLibrary. If a ParticlePropertiesLibrary is not used the
1426 // optional is empty.
1427 std::optional<std::reference_wrapper<ParticlePropertiesLibrary<double, size_t>>> _PPLibrary;
1428
1429 // accumulators for the globals (potential energy and virial).
1430 double _potentialEnergySum{0.};
1431 std::array<double, 3> _virialSum{0., 0., 0.};
1432 std::vector<AoSThreadData> _aosThreadData{};
1433
1434 // flag to indicate whether post-processing has been performed.
1435 bool _postProcessed{false};
1436
1437 // The Vectorization Pattern currently used in the SoA functor.
1438 VectorizationPattern _vecPattern{};
1439
1440 // Vectorization Pattern that the functor can handle.
1441 static constexpr std::array<VectorizationPattern, 4> _vecPatternsAllowed = {
1442 VectorizationPattern::p1xVec, VectorizationPattern::p2xVecDiv2, VectorizationPattern::pVecDiv2x2,
1443 VectorizationPattern::pVecx1};
1444};
1445} // namespace mdLib
decltype(highway::FirstN(tag_long, 2)) MaskLong
Type for a Long Mask.
Definition: LJFunctorHWY.h:46
constexpr highway::Half< highway::DFromV< VectorDouble > > tag_double_half
Highway tag for a half-filled double register.
Definition: LJFunctorHWY.h:42
decltype(highway::Zero(tag_double)) VectorDouble
Type for a Double vector register.
Definition: LJFunctorHWY.h:38
constexpr highway::ScalableTag< double > tag_double
Highway tag for full double register.
Definition: LJFunctorHWY.h:26
constexpr size_t _maxVecLengthDouble
Upper bound on the number of double values in a full register.
Definition: LJFunctorHWY.h:36
autopas::VectorizationPatternOption::Value VectorizationPattern
Vectorization Pattern Type.
Definition: LJFunctorHWY.h:48
decltype(highway::Zero(tag_long)) VectorLong
Type for a Long vector register.
Definition: LJFunctorHWY.h:40
HWY_LANES_CONSTEXPR size_t _vecLengthDouble
Number of double values in a full register.
Definition: LJFunctorHWY.h:32
constexpr highway::ScalableTag< int64_t > tag_long
Highway tag for full long register.
Definition: LJFunctorHWY.h:28
decltype(highway::FirstN(tag_double, 1)) MaskDouble
Type for a Double Mask.
Definition: LJFunctorHWY.h:44
#define AutoPasLog(lvl, fmt,...)
Macro for logging providing common meta information without filename.
Definition: Logger.h:24
This class stores the (physical) properties of molecule types, and, in the case of multi-site molecul...
Definition: ParticlePropertiesLibrary.h:28
static double calcShift6(double epsilon24, double sigmaSquared, double cutoffSquared)
Calculate the shift multiplied 6 of the lennard jones potential from given cutoff,...
Definition: ParticlePropertiesLibrary.h:576
AlignedAllocator class.
Definition: AlignedAllocator.h:29
PairwiseFunctor class.
Definition: PairwiseFunctor.h:31
PairwiseFunctor(double cutoff)
Constructor.
Definition: PairwiseFunctor.h:42
View on a fixed part of a SoA between a start index and an end index.
Definition: SoAView.h:23
size_t size() const
Returns the number of particles in the view.
Definition: SoAView.h:83
Default exception class for autopas exceptions.
Definition: ExceptionHandler.h:116
static void exception(const Exception e)
Handle an exception derived by std::exception.
Definition: ExceptionHandler.h:64
A functor to handle lennard-jones interactions between two particles (molecules) This functor uses th...
Definition: LJFunctorHWY.h:67
double getVirial() const
Get the virial.
Definition: LJFunctorHWY.h:1363
bool isRelevantForTuning() final
Specifies whether the functor should be considered for the auto-tuning process.
Definition: LJFunctorHWY.h:108
void SoAFunctorPair(autopas::SoAView< SoAArraysType > soa1, autopas::SoAView< SoAArraysType > soa2, bool newton3) final
PairwiseFunctor for structure of arrays (SoA)
Definition: LJFunctorHWY.h:232
LJFunctorHWY(double cutoff, std::optional< std::reference_wrapper< ParticlePropertiesLibrary< double, size_t > > > particlePropertiesLibrary=std::nullopt)
Constructor for Functor with mixing enabled/disabled.
Definition: LJFunctorHWY.h:82
void setVecPattern(const VectorizationPattern vecPattern) final
Setter for the vectorization pattern to be used.
Definition: LJFunctorHWY.h:1397
void initTraversal() final
Reset the global values.
Definition: LJFunctorHWY.h:1303
void SoAFunctorSingle(autopas::SoAView< SoAArraysType > soa, const bool newton3) final
PairwiseFunctor for structure of arrays (SoA)
Definition: LJFunctorHWY.h:189
void AoSFunctor(Particle_T &i, Particle_T &j, bool newton3) final
PairwiseFunctor for arrays of structures (AoS).
Definition: LJFunctorHWY.h:132
bool allowsNonNewton3() final
Specifies whether the functor is capable of non-Newton3-like functors.
Definition: LJFunctorHWY.h:114
static constexpr auto getNeededAttr()
Get attributes needed for computation.
Definition: LJFunctorHWY.h:1263
static constexpr auto getNeededAttr(std::false_type)
Get attributes needed for computation without N3 optimization.
Definition: LJFunctorHWY.h:1278
LJFunctorHWY()=delete
Deleted default constructor.
static constexpr bool getMixing()
Definition: LJFunctorHWY.h:1297
std::string getName() final
Returns name of functor.
Definition: LJFunctorHWY.h:106
static constexpr auto getComputedAttr()
Get attributes computed by this functor.
Definition: LJFunctorHWY.h:1288
bool allowsNewton3() final
Specifies whether the functor is capable of Newton3-like functors.
Definition: LJFunctorHWY.h:110
void SoAFunctorVerlet(autopas::SoAView< SoAArraysType > soa, const size_t indexFirst, const std::vector< size_t, autopas::AlignedAllocator< size_t > > &neighborList, bool newton3) final
PairwiseFunctor for structure of arrays (SoA) for neighbor lists.
Definition: LJFunctorHWY.h:1127
double getPotentialEnergy() const
Get the potential Energy.
Definition: LJFunctorHWY.h:1347
bool isVecPatternAllowed(const VectorizationPattern vecPattern) final
Specifies whether the functor is capable of using the specified Vectorization Pattern in the SoA func...
Definition: LJFunctorHWY.h:125
void setParticleProperties(const double epsilon24, const double sigmaSquare)
Sets the particle properties constants for this functor.
Definition: LJFunctorHWY.h:1384
void endTraversal(const bool newton3) final
Accumulates global values, e.g.
Definition: LJFunctorHWY.h:1316
constexpr T dot(const std::array< T, SIZE > &a, const std::array< T, SIZE > &b)
Generates the dot product of two arrays.
Definition: ArrayMath.h:233
This is the main namespace of AutoPas.
Definition: AutoPasDecl.h:34
int autopas_get_max_threads()
Dummy for omp_get_max_threads() when no OpenMP is available.
Definition: WrapOpenMP.h:144
OwnershipState
Enum that specifies the state of ownership.
Definition: OwnershipState.h:20
@ dummy
Dummy or deleted state, a particle with this state is not an actual particle!
@ owned
Owned state, a particle with this state is an actual particle and owned by the current AutoPas object...
FunctorN3Modes
Newton 3 modes for the Functor.
Definition: Functor.h:23
int autopas_get_thread_num()
Dummy for omp_set_lock() when no OpenMP is available.
Definition: WrapOpenMP.h:132