AutoPas  3.0.0
Loading...
Searching...
No Matches
LJFunctorAVX.h
Go to the documentation of this file.
1
7#pragma once
8#ifndef __AVX__
9#pragma message "Requested to compile LJFunctorAVX but AVX is not available!"
10#else
11#include <immintrin.h>
12#endif
13
14#include <array>
15
23#include "autopas/utils/inBox.h"
24
25namespace mdLib {
26
45template <class Particle_T, bool applyShift = false, bool useMixing = false,
46 autopas::FunctorN3Modes useNewton3 = autopas::FunctorN3Modes::Both, bool calculateGlobals = false,
47 bool countFLOPs = false, bool relevantForTuning = true>
49 : public autopas::PairwiseFunctor<Particle_T, LJFunctorAVX<Particle_T, applyShift, useMixing, useNewton3,
50 calculateGlobals, countFLOPs, relevantForTuning>> {
51 using SoAArraysType = typename Particle_T::SoAArraysType;
52
53 public:
57 LJFunctorAVX() = delete;
58
59 private:
65 explicit LJFunctorAVX(double cutoff, void * /*dummy*/)
66#ifdef __AVX__
67 : autopas::PairwiseFunctor<Particle_T, LJFunctorAVX<Particle_T, applyShift, useMixing, useNewton3,
68 calculateGlobals, countFLOPs, relevantForTuning>>(cutoff),
69 _cutoffSquared{_mm256_set1_pd(cutoff * cutoff)},
70 _cutoffSquaredAoS(cutoff * cutoff),
71 _potentialEnergySum{0.},
72 _virialSum{0., 0., 0.},
73 _aosThreadData(),
74 _postProcessed{false} {
75 if (calculateGlobals) {
76 _aosThreadData.resize(autopas::autopas_get_max_threads());
77 }
78 if constexpr (countFLOPs) {
79 AutoPasLog(DEBUG, "Using LJFunctorAVX with countFLOPs but FLOP counting is not implemented.");
80 }
81 }
82#else
83 : autopas::Functor<Particle_T, LJFunctorAVX<Particle_T, applyShift, useMixing, useNewton3, calculateGlobals,
84 relevantForTuning>>(cutoff) {
85 autopas::utils::ExceptionHandler::exception("AutoPas was compiled without AVX support!");
86 }
87#endif
88 public:
97 explicit LJFunctorAVX(double cutoff) : LJFunctorAVX(cutoff, nullptr) {
98 static_assert(not useMixing,
99 "Mixing without a ParticlePropertiesLibrary is not possible! Use a different constructor or set "
100 "mixing to false.");
101 }
102
109 explicit LJFunctorAVX(double cutoff, ParticlePropertiesLibrary<double, size_t> &particlePropertiesLibrary)
110 : LJFunctorAVX(cutoff, nullptr) {
111 static_assert(useMixing,
112 "Not using Mixing but using a ParticlePropertiesLibrary is not allowed! Use a different constructor "
113 "or set mixing to true.");
114 _PPLibrary = &particlePropertiesLibrary;
115 }
116
117 std::string getName() final { return "LJFunctorAVX"; }
118
119 bool isRelevantForTuning() final { return relevantForTuning; }
120
121 bool allowsNewton3() final {
122 return useNewton3 == autopas::FunctorN3Modes::Newton3Only or useNewton3 == autopas::FunctorN3Modes::Both;
123 }
124
125 bool allowsNonNewton3() final {
126 return useNewton3 == autopas::FunctorN3Modes::Newton3Off or useNewton3 == autopas::FunctorN3Modes::Both;
127 }
128
129 inline void AoSFunctor(Particle_T &i, Particle_T &j, bool newton3) final {
130 using namespace autopas::utils::ArrayMath::literals;
131 if (i.isDummy() or j.isDummy()) {
132 return;
133 }
134 auto sigmaSquared = _sigmaSquaredAoS;
135 auto epsilon24 = _epsilon24AoS;
136 auto shift6 = _shift6AoS;
137 if constexpr (useMixing) {
138 sigmaSquared = _PPLibrary->getMixingSigmaSquared(i.getTypeId(), j.getTypeId());
139 epsilon24 = _PPLibrary->getMixing24Epsilon(i.getTypeId(), j.getTypeId());
140 if constexpr (applyShift) {
141 shift6 = _PPLibrary->getMixingShift6(i.getTypeId(), j.getTypeId());
142 }
143 }
144 auto dr = i.getR() - j.getR();
145 double dr2 = autopas::utils::ArrayMath::dot(dr, dr);
146
147 if (dr2 > _cutoffSquaredAoS) {
148 return;
149 }
150
151 double invdr2 = 1. / dr2;
152 double lj6 = sigmaSquared * invdr2;
153 lj6 = lj6 * lj6 * lj6;
154 double lj12 = lj6 * lj6;
155 double lj12m6 = lj12 - lj6;
156 double fac = epsilon24 * (lj12 + lj12m6) * invdr2;
157 auto f = dr * fac;
158 i.addF(f);
159 if (newton3) {
160 // only if we use newton 3 here, we want to
161 j.subF(f);
162 }
163 if (calculateGlobals) {
164 // We always add the full contribution for each owned particle and divide the sums by 2 in endTraversal().
165 // Potential energy has an additional factor of 6, which is also handled in endTraversal().
166
167 auto virial = dr * f;
168 double potentialEnergy6 = epsilon24 * lj12m6 + shift6;
169
170 const int threadnum = autopas::autopas_get_thread_num();
171 if (i.isOwned()) {
172 _aosThreadData[threadnum].potentialEnergySum += potentialEnergy6;
173 _aosThreadData[threadnum].virialSum += virial;
174 }
175 // for non-newton3 the second particle will be considered in a separate calculation
176 if (newton3 and j.isOwned()) {
177 _aosThreadData[threadnum].potentialEnergySum += potentialEnergy6;
178 _aosThreadData[threadnum].virialSum += virial;
179 }
180 }
181 }
182
187 inline void SoAFunctorSingle(autopas::SoAView<SoAArraysType> soa, bool newton3) final { SoAFunctorSingleImpl(soa); }
188
189 // clang-format off
193 // clang-format on
195 const bool newton3) final {
196 if (newton3) {
197 SoAFunctorPairImpl<true>(soa1, soa2);
198 } else {
199 SoAFunctorPairImpl<false>(soa1, soa2);
200 }
201 }
202
203 private:
208 inline void SoAFunctorSingleImpl(autopas::SoAView<SoAArraysType> soa) {
209#ifdef __AVX__
210 if (soa.size() == 0) return;
211
212 const auto *const __restrict xptr = soa.template begin<Particle_T::AttributeNames::posX>();
213 const auto *const __restrict yptr = soa.template begin<Particle_T::AttributeNames::posY>();
214 const auto *const __restrict zptr = soa.template begin<Particle_T::AttributeNames::posZ>();
215
216 const auto *const __restrict ownedStatePtr = soa.template begin<Particle_T::AttributeNames::ownershipState>();
217
218 auto *const __restrict fxptr = soa.template begin<Particle_T::AttributeNames::forceX>();
219 auto *const __restrict fyptr = soa.template begin<Particle_T::AttributeNames::forceY>();
220 auto *const __restrict fzptr = soa.template begin<Particle_T::AttributeNames::forceZ>();
221
222 const auto *const __restrict typeIDptr = soa.template begin<Particle_T::AttributeNames::typeId>();
223
224 __m256d virialSumX = _mm256_setzero_pd();
225 __m256d virialSumY = _mm256_setzero_pd();
226 __m256d virialSumZ = _mm256_setzero_pd();
227 __m256d potentialEnergySum = _mm256_setzero_pd();
228
229 // reverse outer loop s.th. inner loop always beginns at aligned array start
230 // typecast to detect underflow
231 for (size_t i = soa.size() - 1; (long)i >= 0; --i) {
232 if (ownedStatePtr[i] == autopas::OwnershipState::dummy) {
233 // If the i-th particle is a dummy, skip this loop iteration.
234 continue;
235 }
236
237 static_assert(std::is_same_v<std::underlying_type_t<autopas::OwnershipState>, int64_t>,
238 "OwnershipStates underlying type should be int64_t!");
239 // ownedStatePtr contains int64_t, so we broadcast these to make an __m256i.
240 // _mm256_set1_epi64x broadcasts a 64-bit integer, we use this instruction to have 4 values!
241 __m256i ownedStateI = _mm256_set1_epi64x(static_cast<int64_t>(ownedStatePtr[i]));
242
243 __m256d fxacc = _mm256_setzero_pd();
244 __m256d fyacc = _mm256_setzero_pd();
245 __m256d fzacc = _mm256_setzero_pd();
246
247 const __m256d x1 = _mm256_broadcast_sd(&xptr[i]);
248 const __m256d y1 = _mm256_broadcast_sd(&yptr[i]);
249 const __m256d z1 = _mm256_broadcast_sd(&zptr[i]);
250
251 size_t j = 0;
252 // floor soa numParticles to multiple of vecLength
253 // If b is a power of 2 the following holds:
254 // a & ~(b -1) == a - (a mod b)
255 for (; j < (i & ~(vecLength - 1)); j += 4) {
256 SoAKernel<true, false>(j, ownedStateI, reinterpret_cast<const int64_t *>(ownedStatePtr), x1, y1, z1, xptr, yptr,
257 zptr, fxptr, fyptr, fzptr, &typeIDptr[i], &typeIDptr[j], fxacc, fyacc, fzacc,
258 &virialSumX, &virialSumY, &virialSumZ, &potentialEnergySum, 0);
259 }
260 // If b is a power of 2 the following holds:
261 // a & (b -1) == a mod b
262 const int rest = (int)(i & (vecLength - 1));
263 if (rest > 0) {
264 SoAKernel<true, true>(j, ownedStateI, reinterpret_cast<const int64_t *>(ownedStatePtr), x1, y1, z1, xptr, yptr,
265 zptr, fxptr, fyptr, fzptr, &typeIDptr[i], &typeIDptr[j], fxacc, fyacc, fzacc, &virialSumX,
266 &virialSumY, &virialSumZ, &potentialEnergySum, rest);
267 }
268
269 // horizontally reduce fDacc to sumfD
270 const __m256d hSumfxfy = _mm256_hadd_pd(fxacc, fyacc);
271 const __m256d hSumfz = _mm256_hadd_pd(fzacc, fzacc);
272
273 const __m128d hSumfxfyLow = _mm256_extractf128_pd(hSumfxfy, 0);
274 const __m128d hSumfzLow = _mm256_extractf128_pd(hSumfz, 0);
275
276 const __m128d hSumfxfyHigh = _mm256_extractf128_pd(hSumfxfy, 1);
277 const __m128d hSumfzHigh = _mm256_extractf128_pd(hSumfz, 1);
278
279 const __m128d sumfxfyVEC = _mm_add_pd(hSumfxfyLow, hSumfxfyHigh);
280 const __m128d sumfzVEC = _mm_add_pd(hSumfzLow, hSumfzHigh);
281
282 const double sumfx = sumfxfyVEC[0];
283 const double sumfy = sumfxfyVEC[1];
284 const double sumfz = _mm_cvtsd_f64(sumfzVEC);
285
286 fxptr[i] += sumfx;
287 fyptr[i] += sumfy;
288 fzptr[i] += sumfz;
289 }
290
291 if constexpr (calculateGlobals) {
292 const int threadnum = autopas::autopas_get_thread_num();
293
294 // horizontally reduce virialSumX and virialSumY
295 const __m256d hSumVirialxy = _mm256_hadd_pd(virialSumX, virialSumY);
296 const __m128d hSumVirialxyLow = _mm256_extractf128_pd(hSumVirialxy, 0);
297 const __m128d hSumVirialxyHigh = _mm256_extractf128_pd(hSumVirialxy, 1);
298 const __m128d hSumVirialxyVec = _mm_add_pd(hSumVirialxyHigh, hSumVirialxyLow);
299
300 // horizontally reduce virialSumZ and potentialEnergySum
301 const __m256d hSumVirialzPotentialEnergy = _mm256_hadd_pd(virialSumZ, potentialEnergySum);
302 const __m128d hSumVirialzPotentialEnergyLow = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 0);
303 const __m128d hSumVirialzPotentialEnergyHigh = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 1);
304 const __m128d hSumVirialzPotentialEnergyVec =
305 _mm_add_pd(hSumVirialzPotentialEnergyHigh, hSumVirialzPotentialEnergyLow);
306
307 // globals = {virialX, virialY, virialZ, potentialEnergy}
308 double globals[4];
309 _mm_store_pd(&globals[0], hSumVirialxyVec);
310 _mm_store_pd(&globals[2], hSumVirialzPotentialEnergyVec);
311
312 _aosThreadData[threadnum].virialSum[0] += globals[0];
313 _aosThreadData[threadnum].virialSum[1] += globals[1];
314 _aosThreadData[threadnum].virialSum[2] += globals[2];
315 _aosThreadData[threadnum].potentialEnergySum += globals[3];
316 }
317#endif
318 }
319
320 template <bool newton3>
321 inline void SoAFunctorPairImpl(autopas::SoAView<SoAArraysType> soa1, autopas::SoAView<SoAArraysType> soa2) {
322#ifdef __AVX__
323 if (soa1.size() == 0 || soa2.size() == 0) return;
324
325 const auto *const __restrict x1ptr = soa1.template begin<Particle_T::AttributeNames::posX>();
326 const auto *const __restrict y1ptr = soa1.template begin<Particle_T::AttributeNames::posY>();
327 const auto *const __restrict z1ptr = soa1.template begin<Particle_T::AttributeNames::posZ>();
328 const auto *const __restrict x2ptr = soa2.template begin<Particle_T::AttributeNames::posX>();
329 const auto *const __restrict y2ptr = soa2.template begin<Particle_T::AttributeNames::posY>();
330 const auto *const __restrict z2ptr = soa2.template begin<Particle_T::AttributeNames::posZ>();
331
332 const auto *const __restrict ownedStatePtr1 = soa1.template begin<Particle_T::AttributeNames::ownershipState>();
333 const auto *const __restrict ownedStatePtr2 = soa2.template begin<Particle_T::AttributeNames::ownershipState>();
334
335 auto *const __restrict fx1ptr = soa1.template begin<Particle_T::AttributeNames::forceX>();
336 auto *const __restrict fy1ptr = soa1.template begin<Particle_T::AttributeNames::forceY>();
337 auto *const __restrict fz1ptr = soa1.template begin<Particle_T::AttributeNames::forceZ>();
338 auto *const __restrict fx2ptr = soa2.template begin<Particle_T::AttributeNames::forceX>();
339 auto *const __restrict fy2ptr = soa2.template begin<Particle_T::AttributeNames::forceY>();
340 auto *const __restrict fz2ptr = soa2.template begin<Particle_T::AttributeNames::forceZ>();
341
342 const auto *const __restrict typeID1ptr = soa1.template begin<Particle_T::AttributeNames::typeId>();
343 const auto *const __restrict typeID2ptr = soa2.template begin<Particle_T::AttributeNames::typeId>();
344
345 __m256d virialSumX = _mm256_setzero_pd();
346 __m256d virialSumY = _mm256_setzero_pd();
347 __m256d virialSumZ = _mm256_setzero_pd();
348 __m256d potentialEnergySum = _mm256_setzero_pd();
349
350 for (unsigned int i = 0; i < soa1.size(); ++i) {
351 if (ownedStatePtr1[i] == autopas::OwnershipState::dummy) {
352 // If the i-th particle is a dummy, skip this loop iteration.
353 continue;
354 }
355
356 __m256d fxacc = _mm256_setzero_pd();
357 __m256d fyacc = _mm256_setzero_pd();
358 __m256d fzacc = _mm256_setzero_pd();
359
360 static_assert(std::is_same_v<std::underlying_type_t<autopas::OwnershipState>, int64_t>,
361 "OwnershipStates underlying type should be int64_t!");
362 // ownedStatePtr1 contains int64_t, so we broadcast these to make an __m256i.
363 // _mm256_set1_epi64x broadcasts a 64-bit integer, we use this instruction to have 4 values!
364 __m256i ownedStateI = _mm256_set1_epi64x(static_cast<int64_t>(ownedStatePtr1[i]));
365
366 const __m256d x1 = _mm256_broadcast_sd(&x1ptr[i]);
367 const __m256d y1 = _mm256_broadcast_sd(&y1ptr[i]);
368 const __m256d z1 = _mm256_broadcast_sd(&z1ptr[i]);
369
370 // floor soa2 numParticles to multiple of vecLength
371 unsigned int j = 0;
372 for (; j < (soa2.size() & ~(vecLength - 1)); j += 4) {
373 SoAKernel<newton3, false>(j, ownedStateI, reinterpret_cast<const int64_t *>(ownedStatePtr2), x1, y1, z1, x2ptr,
374 y2ptr, z2ptr, fx2ptr, fy2ptr, fz2ptr, &typeID1ptr[i], &typeID2ptr[j], fxacc, fyacc,
375 fzacc, &virialSumX, &virialSumY, &virialSumZ, &potentialEnergySum, 0);
376 }
377 const int rest = (int)(soa2.size() & (vecLength - 1));
378 if (rest > 0)
379 SoAKernel<newton3, true>(j, ownedStateI, reinterpret_cast<const int64_t *>(ownedStatePtr2), x1, y1, z1, x2ptr,
380 y2ptr, z2ptr, fx2ptr, fy2ptr, fz2ptr, &typeID1ptr[i], &typeID2ptr[j], fxacc, fyacc,
381 fzacc, &virialSumX, &virialSumY, &virialSumZ, &potentialEnergySum, rest);
382
383 // horizontally reduce fDacc to sumfD
384 const __m256d hSumfxfy = _mm256_hadd_pd(fxacc, fyacc);
385 const __m256d hSumfz = _mm256_hadd_pd(fzacc, fzacc);
386
387 const __m128d hSumfxfyLow = _mm256_extractf128_pd(hSumfxfy, 0);
388 const __m128d hSumfzLow = _mm256_extractf128_pd(hSumfz, 0);
389
390 const __m128d hSumfxfyHigh = _mm256_extractf128_pd(hSumfxfy, 1);
391 const __m128d hSumfzHigh = _mm256_extractf128_pd(hSumfz, 1);
392
393 const __m128d sumfxfyVEC = _mm_add_pd(hSumfxfyLow, hSumfxfyHigh);
394 const __m128d sumfzVEC = _mm_add_pd(hSumfzLow, hSumfzHigh);
395
396 const double sumfx = sumfxfyVEC[0];
397 const double sumfy = sumfxfyVEC[1];
398 const double sumfz = _mm_cvtsd_f64(sumfzVEC);
399
400 fx1ptr[i] += sumfx;
401 fy1ptr[i] += sumfy;
402 fz1ptr[i] += sumfz;
403 }
404
405 if constexpr (calculateGlobals) {
406 const int threadnum = autopas::autopas_get_thread_num();
407
408 // horizontally reduce virialSumX and virialSumY
409 const __m256d hSumVirialxy = _mm256_hadd_pd(virialSumX, virialSumY);
410 const __m128d hSumVirialxyLow = _mm256_extractf128_pd(hSumVirialxy, 0);
411 const __m128d hSumVirialxyHigh = _mm256_extractf128_pd(hSumVirialxy, 1);
412 const __m128d hSumVirialxyVec = _mm_add_pd(hSumVirialxyHigh, hSumVirialxyLow);
413
414 // horizontally reduce virialSumZ and potentialEnergySum
415 const __m256d hSumVirialzPotentialEnergy = _mm256_hadd_pd(virialSumZ, potentialEnergySum);
416 const __m128d hSumVirialzPotentialEnergyLow = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 0);
417 const __m128d hSumVirialzPotentialEnergyHigh = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 1);
418 const __m128d hSumVirialzPotentialEnergyVec =
419 _mm_add_pd(hSumVirialzPotentialEnergyHigh, hSumVirialzPotentialEnergyLow);
420
421 // globals = {virialX, virialY, virialZ, potentialEnergy}
422 double globals[4];
423 _mm_store_pd(&globals[0], hSumVirialxyVec);
424 _mm_store_pd(&globals[2], hSumVirialzPotentialEnergyVec);
425
426 _aosThreadData[threadnum].virialSum[0] += globals[0];
427 _aosThreadData[threadnum].virialSum[1] += globals[1];
428 _aosThreadData[threadnum].virialSum[2] += globals[2];
429 _aosThreadData[threadnum].potentialEnergySum += globals[3];
430 }
431#endif
432 }
433
434#ifdef __AVX__
464 template <bool newton3, bool remainderIsMasked>
465 inline void SoAKernel(const size_t j, const __m256i ownedStateI, const int64_t *const __restrict ownedStatePtr2,
466 const __m256d &x1, const __m256d &y1, const __m256d &z1, const double *const __restrict x2ptr,
467 const double *const __restrict y2ptr, const double *const __restrict z2ptr,
468 double *const __restrict fx2ptr, double *const __restrict fy2ptr,
469 double *const __restrict fz2ptr, const size_t *const typeID1ptr, const size_t *const typeID2ptr,
470 __m256d &fxacc, __m256d &fyacc, __m256d &fzacc, __m256d *virialSumX, __m256d *virialSumY,
471 __m256d *virialSumZ, __m256d *potentialEnergySum, const unsigned int rest = 0) {
472 __m256d epsilon24s = _epsilon24;
473 __m256d sigmaSquareds = _sigmaSquared;
474 __m256d shift6s = _shift6;
475 if (useMixing) {
476 // the first argument for set lands in the last bits of the register
477 epsilon24s = _mm256_set_pd(
478 not remainderIsMasked or rest > 3 ? _PPLibrary->getMixing24Epsilon(*typeID1ptr, *(typeID2ptr + 3)) : 0,
479 not remainderIsMasked or rest > 2 ? _PPLibrary->getMixing24Epsilon(*typeID1ptr, *(typeID2ptr + 2)) : 0,
480 not remainderIsMasked or rest > 1 ? _PPLibrary->getMixing24Epsilon(*typeID1ptr, *(typeID2ptr + 1)) : 0,
481 _PPLibrary->getMixing24Epsilon(*typeID1ptr, *(typeID2ptr + 0)));
482 sigmaSquareds = _mm256_set_pd(
483 not remainderIsMasked or rest > 3 ? _PPLibrary->getMixingSigmaSquared(*typeID1ptr, *(typeID2ptr + 3)) : 0,
484 not remainderIsMasked or rest > 2 ? _PPLibrary->getMixingSigmaSquared(*typeID1ptr, *(typeID2ptr + 2)) : 0,
485 not remainderIsMasked or rest > 1 ? _PPLibrary->getMixingSigmaSquared(*typeID1ptr, *(typeID2ptr + 1)) : 0,
486 _PPLibrary->getMixingSigmaSquared(*typeID1ptr, *(typeID2ptr + 0)));
487 if constexpr (applyShift) {
488 shift6s = _mm256_set_pd(
489 (not remainderIsMasked or rest > 3) ? _PPLibrary->getMixingShift6(*typeID1ptr, *(typeID2ptr + 3)) : 0,
490 (not remainderIsMasked or rest > 2) ? _PPLibrary->getMixingShift6(*typeID1ptr, *(typeID2ptr + 2)) : 0,
491 (not remainderIsMasked or rest > 1) ? _PPLibrary->getMixingShift6(*typeID1ptr, *(typeID2ptr + 1)) : 0,
492 _PPLibrary->getMixingShift6(*typeID1ptr, *(typeID2ptr + 0)));
493 }
494 }
495
496 const __m256d x2 = remainderIsMasked ? _mm256_maskload_pd(&x2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&x2ptr[j]);
497 const __m256d y2 = remainderIsMasked ? _mm256_maskload_pd(&y2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&y2ptr[j]);
498 const __m256d z2 = remainderIsMasked ? _mm256_maskload_pd(&z2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&z2ptr[j]);
499
500 const __m256d drx = _mm256_sub_pd(x1, x2);
501 const __m256d dry = _mm256_sub_pd(y1, y2);
502 const __m256d drz = _mm256_sub_pd(z1, z2);
503
504 const __m256d drx2 = _mm256_mul_pd(drx, drx);
505 const __m256d dry2 = _mm256_mul_pd(dry, dry);
506 const __m256d drz2 = _mm256_mul_pd(drz, drz);
507
508 const __m256d dr2PART = _mm256_add_pd(drx2, dry2);
509 const __m256d dr2 = _mm256_add_pd(dr2PART, drz2);
510
511 // _CMP_LE_OS == Less-Equal-then (ordered, signaling)
512 // signaling = throw error if NaN is encountered
513 // dr2 <= _cutoffSquared ? 0xFFFFFFFFFFFFFFFF : 0
514 const __m256d cutoffMask = _mm256_cmp_pd(dr2, _cutoffSquared, _CMP_LE_OS);
515
516 // This requires that dummy is zero (otherwise when loading using a mask the owned state will not be zero)
517 const __m256i ownedStateJ = remainderIsMasked
518 ? _mm256_castpd_si256(_mm256_maskload_pd(
519 reinterpret_cast<double const *>(&ownedStatePtr2[j]), _masks[rest - 1]))
520 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(&ownedStatePtr2[j]));
521 // This requires that dummy is the first entry in OwnershipState!
522 const __m256d dummyMask = _mm256_cmp_pd(_mm256_castsi256_pd(ownedStateJ), _zero, _CMP_NEQ_UQ);
523 const __m256d cutoffDummyMask = _mm256_and_pd(cutoffMask, dummyMask);
524
525 // if everything is masked away return from this function.
526 if (_mm256_movemask_pd(cutoffDummyMask) == 0) {
527 return;
528 }
529
530 const __m256d invdr2 = _mm256_div_pd(_one, dr2);
531 const __m256d lj2 = _mm256_mul_pd(sigmaSquareds, invdr2);
532 const __m256d lj4 = _mm256_mul_pd(lj2, lj2);
533 const __m256d lj6 = _mm256_mul_pd(lj2, lj4);
534 const __m256d lj12 = _mm256_mul_pd(lj6, lj6);
535 const __m256d lj12m6 = _mm256_sub_pd(lj12, lj6);
536 const __m256d lj12m6alj12 = _mm256_add_pd(lj12m6, lj12);
537 const __m256d lj12m6alj12e = _mm256_mul_pd(lj12m6alj12, epsilon24s);
538 const __m256d fac = _mm256_mul_pd(lj12m6alj12e, invdr2);
539
540 const __m256d facMasked =
541 remainderIsMasked ? _mm256_and_pd(fac, _mm256_and_pd(cutoffDummyMask, _mm256_castsi256_pd(_masks[rest - 1])))
542 : _mm256_and_pd(fac, cutoffDummyMask);
543
544 const __m256d fx = _mm256_mul_pd(drx, facMasked);
545 const __m256d fy = _mm256_mul_pd(dry, facMasked);
546 const __m256d fz = _mm256_mul_pd(drz, facMasked);
547
548 fxacc = _mm256_add_pd(fxacc, fx);
549 fyacc = _mm256_add_pd(fyacc, fy);
550 fzacc = _mm256_add_pd(fzacc, fz);
551
552 // if newton 3 is used subtract fD from particle j
553 if constexpr (newton3) {
554 const __m256d fx2 =
555 remainderIsMasked ? _mm256_maskload_pd(&fx2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&fx2ptr[j]);
556 const __m256d fy2 =
557 remainderIsMasked ? _mm256_maskload_pd(&fy2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&fy2ptr[j]);
558 const __m256d fz2 =
559 remainderIsMasked ? _mm256_maskload_pd(&fz2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&fz2ptr[j]);
560
561 const __m256d fx2new = _mm256_sub_pd(fx2, fx);
562 const __m256d fy2new = _mm256_sub_pd(fy2, fy);
563 const __m256d fz2new = _mm256_sub_pd(fz2, fz);
564
565 remainderIsMasked ? _mm256_maskstore_pd(&fx2ptr[j], _masks[rest - 1], fx2new)
566 : _mm256_storeu_pd(&fx2ptr[j], fx2new);
567 remainderIsMasked ? _mm256_maskstore_pd(&fy2ptr[j], _masks[rest - 1], fy2new)
568 : _mm256_storeu_pd(&fy2ptr[j], fy2new);
569 remainderIsMasked ? _mm256_maskstore_pd(&fz2ptr[j], _masks[rest - 1], fz2new)
570 : _mm256_storeu_pd(&fz2ptr[j], fz2new);
571 }
572
573 if constexpr (calculateGlobals) {
574 // Global Virial
575 const __m256d virialX = _mm256_mul_pd(fx, drx);
576 const __m256d virialY = _mm256_mul_pd(fy, dry);
577 const __m256d virialZ = _mm256_mul_pd(fz, drz);
578
579 // Global Potential
580 const __m256d potentialEnergy = wrapperFMA(epsilon24s, lj12m6, shift6s);
581
582 const __m256d potentialEnergyMasked =
583 remainderIsMasked
584 ? _mm256_and_pd(potentialEnergy, _mm256_and_pd(cutoffDummyMask, _mm256_castsi256_pd(_masks[rest - 1])))
585 : _mm256_and_pd(potentialEnergy, cutoffDummyMask);
586
587 __m256d ownedMaskI =
588 _mm256_cmp_pd(_mm256_castsi256_pd(ownedStateI), _mm256_castsi256_pd(_ownedStateOwnedMM256i), _CMP_EQ_UQ);
589 __m256d energyFactor = _mm256_blendv_pd(_zero, _one, ownedMaskI);
590 if constexpr (newton3) {
591 __m256d ownedMaskJ =
592 _mm256_cmp_pd(_mm256_castsi256_pd(ownedStateJ), _mm256_castsi256_pd(_ownedStateOwnedMM256i), _CMP_EQ_UQ);
593 energyFactor = _mm256_add_pd(energyFactor, _mm256_blendv_pd(_zero, _one, ownedMaskJ));
594 }
595 *potentialEnergySum = wrapperFMA(energyFactor, potentialEnergyMasked, *potentialEnergySum);
596 *virialSumX = wrapperFMA(energyFactor, virialX, *virialSumX);
597 *virialSumY = wrapperFMA(energyFactor, virialY, *virialSumY);
598 *virialSumZ = wrapperFMA(energyFactor, virialZ, *virialSumZ);
599 }
600 }
601#endif
602
603 public:
604 // clang-format off
610 // clang-format on
611 inline void SoAFunctorVerlet(autopas::SoAView<SoAArraysType> soa, const size_t indexFirst,
612 const std::vector<size_t, autopas::AlignedAllocator<size_t>> &neighborList,
613 bool newton3) final {
614 if (soa.size() == 0 or neighborList.empty()) return;
615 if (newton3) {
616 SoAFunctorVerletImpl<true>(soa, indexFirst, neighborList);
617 } else {
618 SoAFunctorVerletImpl<false>(soa, indexFirst, neighborList);
619 }
620 }
621
622 private:
623 template <bool newton3>
624 inline void SoAFunctorVerletImpl(autopas::SoAView<SoAArraysType> soa, const size_t indexFirst,
625 const std::vector<size_t, autopas::AlignedAllocator<size_t>> &neighborList) {
626#ifdef __AVX__
627 const auto *const __restrict ownedStatePtr = soa.template begin<Particle_T::AttributeNames::ownershipState>();
628 if (ownedStatePtr[indexFirst] == autopas::OwnershipState::dummy) {
629 return;
630 }
631
632 const auto *const __restrict xptr = soa.template begin<Particle_T::AttributeNames::posX>();
633 const auto *const __restrict yptr = soa.template begin<Particle_T::AttributeNames::posY>();
634 const auto *const __restrict zptr = soa.template begin<Particle_T::AttributeNames::posZ>();
635
636 auto *const __restrict fxptr = soa.template begin<Particle_T::AttributeNames::forceX>();
637 auto *const __restrict fyptr = soa.template begin<Particle_T::AttributeNames::forceY>();
638 auto *const __restrict fzptr = soa.template begin<Particle_T::AttributeNames::forceZ>();
639
640 const auto *const __restrict typeIDptr = soa.template begin<Particle_T::AttributeNames::typeId>();
641
642 // accumulators
643 __m256d virialSumX = _mm256_setzero_pd();
644 __m256d virialSumY = _mm256_setzero_pd();
645 __m256d virialSumZ = _mm256_setzero_pd();
646 __m256d potentialEnergySum = _mm256_setzero_pd();
647 __m256d fxacc = _mm256_setzero_pd();
648 __m256d fyacc = _mm256_setzero_pd();
649 __m256d fzacc = _mm256_setzero_pd();
650
651 // broadcast particle 1
652 const auto x1 = _mm256_broadcast_sd(&xptr[indexFirst]);
653 const auto y1 = _mm256_broadcast_sd(&yptr[indexFirst]);
654 const auto z1 = _mm256_broadcast_sd(&zptr[indexFirst]);
655 // ownedStatePtr contains int64_t, so we broadcast these to make an __m256i.
656 // _mm256_set1_epi64x broadcasts a 64-bit integer, we use this instruction to have 4 values!
657 __m256i ownedStateI = _mm256_set1_epi64x(static_cast<int64_t>(ownedStatePtr[indexFirst]));
658
659 alignas(64) std::array<double, vecLength> x2tmp{};
660 alignas(64) std::array<double, vecLength> y2tmp{};
661 alignas(64) std::array<double, vecLength> z2tmp{};
662 alignas(64) std::array<double, vecLength> fx2tmp{};
663 alignas(64) std::array<double, vecLength> fy2tmp{};
664 alignas(64) std::array<double, vecLength> fz2tmp{};
665 alignas(64) std::array<size_t, vecLength> typeID2tmp{};
666 alignas(64) std::array<autopas::OwnershipState, vecLength> ownedStates2tmp{};
667
668 // load 4 neighbors
669 size_t j = 0;
670 // Loop over all neighbors as long as we can fill full vectors
671 // (until `neighborList.size() - neighborList.size() % vecLength`)
672 //
673 // If b is a power of 2 the following holds:
674 // a & ~(b - 1) == a - (a mod b)
675 for (; j < (neighborList.size() & ~(vecLength - 1)); j += vecLength) {
676 // AVX2 variant:
677 // create buffer for 4 interaction particles
678 // and fill buffers via gathering
679 // const __m256d x2tmp = _mm256_i64gather_pd(&xptr[j], _vindex, 1);
680 // const __m256d y2tmp = _mm256_i64gather_pd(&yptr[j], _vindex, 1);
681 // const __m256d z2tmp = _mm256_i64gather_pd(&zptr[j], _vindex, 1);
682 // const __m256d fx2tmp = _mm256_i64gather_pd(&fxptr[j], _vindex, 1);
683 // const __m256d fy2tmp = _mm256_i64gather_pd(&fyptr[j], _vindex, 1);
684 // const __m256d fz2tmp = _mm256_i64gather_pd(&fzptr[j], _vindex, 1);
685 // const __m256i typeID2tmp = _mm256_i64gather_epi64(&typeIDptr[j], _vindex, 1);
686
687 for (size_t vecIndex = 0; vecIndex < vecLength; ++vecIndex) {
688 x2tmp[vecIndex] = xptr[neighborList[j + vecIndex]];
689 y2tmp[vecIndex] = yptr[neighborList[j + vecIndex]];
690 z2tmp[vecIndex] = zptr[neighborList[j + vecIndex]];
691 if constexpr (newton3) {
692 fx2tmp[vecIndex] = fxptr[neighborList[j + vecIndex]];
693 fy2tmp[vecIndex] = fyptr[neighborList[j + vecIndex]];
694 fz2tmp[vecIndex] = fzptr[neighborList[j + vecIndex]];
695 }
696 typeID2tmp[vecIndex] = typeIDptr[neighborList[j + vecIndex]];
697 ownedStates2tmp[vecIndex] = ownedStatePtr[neighborList[j + vecIndex]];
698 }
699
700 SoAKernel<newton3, false>(0, ownedStateI, reinterpret_cast<const int64_t *>(ownedStates2tmp.data()), x1, y1, z1,
701 x2tmp.data(), y2tmp.data(), z2tmp.data(), fx2tmp.data(), fy2tmp.data(), fz2tmp.data(),
702 &typeIDptr[indexFirst], typeID2tmp.data(), fxacc, fyacc, fzacc, &virialSumX,
703 &virialSumY, &virialSumZ, &potentialEnergySum, 0);
704
705 if constexpr (newton3) {
706 for (size_t vecIndex = 0; vecIndex < vecLength; ++vecIndex) {
707 fxptr[neighborList[j + vecIndex]] = fx2tmp[vecIndex];
708 fyptr[neighborList[j + vecIndex]] = fy2tmp[vecIndex];
709 fzptr[neighborList[j + vecIndex]] = fz2tmp[vecIndex];
710 }
711 }
712 }
713 // Remainder loop
714 // If b is a power of 2 the following holds:
715 // a & (b - 1) == a mod b
716 const auto rest = static_cast<int>(neighborList.size() & (vecLength - 1));
717 if (rest > 0) {
718 // AVX2 variant:
719 // create buffer for 4 interaction particles
720 // and fill buffers via gathering
721 // TODO: use masked load because there will not be enough data left for the whole gather
722 // const __m256d x2tmp = _mm256_i64gather_pd(&xptr[j], _vindex, 1);
723 // const __m256d y2tmp = _mm256_i64gather_pd(&yptr[j], _vindex, 1);
724 // const __m256d z2tmp = _mm256_i64gather_pd(&zptr[j], _vindex, 1);
725 // const __m256d fx2tmp = _mm256_i64gather_pd(&fxptr[j], _vindex, 1);
726 // const __m256d fy2tmp = _mm256_i64gather_pd(&fyptr[j], _vindex, 1);
727 // const __m256d fz2tmp = _mm256_i64gather_pd(&fzptr[j], _vindex, 1);
728 // const __m256d typeID2tmp = _mm256_i64gather_pd(&typeIDptr[j], _vindex, 1);
729
730 for (size_t vecIndex = 0; vecIndex < rest; ++vecIndex) {
731 x2tmp[vecIndex] = xptr[neighborList[j + vecIndex]];
732 y2tmp[vecIndex] = yptr[neighborList[j + vecIndex]];
733 z2tmp[vecIndex] = zptr[neighborList[j + vecIndex]];
734 // if newton3 is used we need to load f of particle j so the kernel can update it too
735 if constexpr (newton3) {
736 fx2tmp[vecIndex] = fxptr[neighborList[j + vecIndex]];
737 fy2tmp[vecIndex] = fyptr[neighborList[j + vecIndex]];
738 fz2tmp[vecIndex] = fzptr[neighborList[j + vecIndex]];
739 }
740 typeID2tmp[vecIndex] = typeIDptr[neighborList[j + vecIndex]];
741 ownedStates2tmp[vecIndex] = ownedStatePtr[neighborList[j + vecIndex]];
742 }
743
744 SoAKernel<newton3, true>(0, ownedStateI, reinterpret_cast<const int64_t *>(ownedStates2tmp.data()), x1, y1, z1,
745 x2tmp.data(), y2tmp.data(), z2tmp.data(), fx2tmp.data(), fy2tmp.data(), fz2tmp.data(),
746 &typeIDptr[indexFirst], typeID2tmp.data(), fxacc, fyacc, fzacc, &virialSumX, &virialSumY,
747 &virialSumZ, &potentialEnergySum, rest);
748
749 if constexpr (newton3) {
750 for (size_t vecIndex = 0; vecIndex < rest; ++vecIndex) {
751 fxptr[neighborList[j + vecIndex]] = fx2tmp[vecIndex];
752 fyptr[neighborList[j + vecIndex]] = fy2tmp[vecIndex];
753 fzptr[neighborList[j + vecIndex]] = fz2tmp[vecIndex];
754 }
755 }
756 }
757
758 // horizontally reduce fDacc to sumfD
759 const __m256d hSumfxfy = _mm256_hadd_pd(fxacc, fyacc);
760 const __m256d hSumfz = _mm256_hadd_pd(fzacc, fzacc);
761
762 const __m128d hSumfxfyLow = _mm256_extractf128_pd(hSumfxfy, 0);
763 const __m128d hSumfzLow = _mm256_extractf128_pd(hSumfz, 0);
764
765 const __m128d hSumfxfyHigh = _mm256_extractf128_pd(hSumfxfy, 1);
766 const __m128d hSumfzHigh = _mm256_extractf128_pd(hSumfz, 1);
767
768 const __m128d sumfxfyVEC = _mm_add_pd(hSumfxfyLow, hSumfxfyHigh);
769 const __m128d sumfzVEC = _mm_add_pd(hSumfzLow, hSumfzHigh);
770
771 const double sumfx = sumfxfyVEC[0];
772 const double sumfy = sumfxfyVEC[1];
773 const double sumfz = _mm_cvtsd_f64(sumfzVEC);
774
775 fxptr[indexFirst] += sumfx;
776 fyptr[indexFirst] += sumfy;
777 fzptr[indexFirst] += sumfz;
778
779 if constexpr (calculateGlobals) {
780 const int threadnum = autopas::autopas_get_thread_num();
781
782 // horizontally reduce virialSumX and virialSumY
783 const __m256d hSumVirialxy = _mm256_hadd_pd(virialSumX, virialSumY);
784 const __m128d hSumVirialxyLow = _mm256_extractf128_pd(hSumVirialxy, 0);
785 const __m128d hSumVirialxyHigh = _mm256_extractf128_pd(hSumVirialxy, 1);
786 const __m128d hSumVirialxyVec = _mm_add_pd(hSumVirialxyHigh, hSumVirialxyLow);
787
788 // horizontally reduce virialSumZ and potentialEnergySum
789 const __m256d hSumVirialzPotentialEnergy = _mm256_hadd_pd(virialSumZ, potentialEnergySum);
790 const __m128d hSumVirialzPotentialEnergyLow = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 0);
791 const __m128d hSumVirialzPotentialEnergyHigh = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 1);
792 const __m128d hSumVirialzPotentialEnergyVec =
793 _mm_add_pd(hSumVirialzPotentialEnergyHigh, hSumVirialzPotentialEnergyLow);
794
795 // globals = {virialX, virialY, virialZ, potentialEnergy}
796 double globals[4];
797 _mm_store_pd(&globals[0], hSumVirialxyVec);
798 _mm_store_pd(&globals[2], hSumVirialzPotentialEnergyVec);
799
800 _aosThreadData[threadnum].virialSum[0] += globals[0];
801 _aosThreadData[threadnum].virialSum[1] += globals[1];
802 _aosThreadData[threadnum].virialSum[2] += globals[2];
803 _aosThreadData[threadnum].potentialEnergySum += globals[3];
804 }
805 // interact with i with 4 neighbors
806#endif // __AVX__
807 }
808
809 public:
813 constexpr static auto getNeededAttr() {
814 return std::array<typename Particle_T::AttributeNames, 9>{Particle_T::AttributeNames::id,
815 Particle_T::AttributeNames::posX,
816 Particle_T::AttributeNames::posY,
817 Particle_T::AttributeNames::posZ,
818 Particle_T::AttributeNames::forceX,
819 Particle_T::AttributeNames::forceY,
820 Particle_T::AttributeNames::forceZ,
821 Particle_T::AttributeNames::typeId,
822 Particle_T::AttributeNames::ownershipState};
823 }
824
828 constexpr static auto getNeededAttr(std::false_type) {
829 return std::array<typename Particle_T::AttributeNames, 6>{
830 Particle_T::AttributeNames::id, Particle_T::AttributeNames::posX,
831 Particle_T::AttributeNames::posY, Particle_T::AttributeNames::posZ,
832 Particle_T::AttributeNames::typeId, Particle_T::AttributeNames::ownershipState};
833 }
834
838 constexpr static auto getComputedAttr() {
839 return std::array<typename Particle_T::AttributeNames, 3>{
840 Particle_T::AttributeNames::forceX, Particle_T::AttributeNames::forceY, Particle_T::AttributeNames::forceZ};
841 }
842
847 constexpr static bool getMixing() { return useMixing; }
848
853 void initTraversal() final {
854 _potentialEnergySum = 0.;
855 _virialSum = {0., 0., 0.};
856 _postProcessed = false;
857 for (size_t i = 0; i < _aosThreadData.size(); ++i) {
858 _aosThreadData[i].setZero();
859 }
860 }
861
866 void endTraversal(bool newton3) final {
867 using namespace autopas::utils::ArrayMath::literals;
868
869 if (_postProcessed) {
871 "Already postprocessed, endTraversal(bool newton3) was called twice without calling initTraversal().");
872 }
873 if (calculateGlobals) {
874 for (size_t i = 0; i < _aosThreadData.size(); ++i) {
875 _potentialEnergySum += _aosThreadData[i].potentialEnergySum;
876 _virialSum += _aosThreadData[i].virialSum;
877 }
878 // For each interaction, we added the full contribution for both particles. Divide by 2 here, so that each
879 // contribution is only counted once per pair.
880 _potentialEnergySum *= 0.5;
881 _virialSum *= 0.5;
882
883 // We have always calculated 6*potentialEnergy, so we divide by 6 here!
884 _potentialEnergySum /= 6.;
885 _postProcessed = true;
886
887 AutoPasLog(DEBUG, "Final potential energy {}", _potentialEnergySum);
888 AutoPasLog(DEBUG, "Final virial {}", _virialSum[0] + _virialSum[1] + _virialSum[2]);
889 }
890 }
891
897 if (not calculateGlobals) {
899 "Trying to get potential energy even though calculateGlobals is false. If you want this functor to calculate "
900 "global "
901 "values, please specify calculateGlobals to be true.");
902 }
903 if (not _postProcessed) {
905 "Cannot get potential energy, because endTraversal was not called.");
906 }
907 return _potentialEnergySum;
908 }
909
914 double getVirial() {
915 if (not calculateGlobals) {
917 "Trying to get virial even though calculateGlobals is false. If you want this functor to calculate global "
918 "values, please specify calculateGlobals to be true.");
919 }
920 if (not _postProcessed) {
922 "Cannot get virial, because endTraversal was not called.");
923 }
924 return _virialSum[0] + _virialSum[1] + _virialSum[2];
925 }
926
935 void setParticleProperties(double epsilon24, double sigmaSquared) {
936#ifdef __AVX__
937 _epsilon24 = _mm256_set1_pd(epsilon24);
938 _sigmaSquared = _mm256_set1_pd(sigmaSquared);
939 if constexpr (applyShift) {
940 _shift6 = _mm256_set1_pd(
941 ParticlePropertiesLibrary<double, size_t>::calcShift6(epsilon24, sigmaSquared, _cutoffSquared[0]));
942 } else {
943 _shift6 = _mm256_setzero_pd();
944 }
945#endif
946
947 _epsilon24AoS = epsilon24;
948 _sigmaSquaredAoS = sigmaSquared;
949 if constexpr (applyShift) {
950 _shift6AoS = ParticlePropertiesLibrary<double, size_t>::calcShift6(epsilon24, sigmaSquared, _cutoffSquaredAoS);
951 } else {
952 _shift6AoS = 0.;
953 }
954 }
955
956 private:
957#ifdef __AVX__
965 inline __m256d wrapperFMA(const __m256d &factorA, const __m256d &factorB, const __m256d &summandC) {
966#ifdef __FMA__
967 return _mm256_fmadd_pd(factorA, factorB, summandC);
968#elif __AVX__
969 const __m256d tmp = _mm256_mul_pd(factorA, factorB);
970 return _mm256_add_pd(summandC, tmp);
971#else
972 // dummy return. If no vectorization is available this whole class is pointless anyways.
973 return __m256d();
974#endif
975 }
976#endif
977
981 class AoSThreadData {
982 public:
983 AoSThreadData() : virialSum{0., 0., 0.}, potentialEnergySum{0.}, __remainingTo64{} {}
984 void setZero() {
985 virialSum = {0., 0., 0.};
986 potentialEnergySum = 0.;
987 }
988
989 // variables
990 std::array<double, 3> virialSum;
991 double potentialEnergySum;
992
993 private:
994 // dummy parameter to get the right size (64 bytes)
995 double __remainingTo64[(64 - 4 * sizeof(double)) / sizeof(double)];
996 };
997 // make sure of the size of AoSThreadData
998 static_assert(sizeof(AoSThreadData) % 64 == 0, "AoSThreadData has wrong size");
999
1000#ifdef __AVX__
1001 const __m256d _zero{_mm256_set1_pd(0.)};
1002 const __m256d _one{_mm256_set1_pd(1.)};
1003 const __m256i _vindex = _mm256_set_epi64x(0, 1, 3, 4);
1004 const __m256i _masks[3]{
1005 _mm256_set_epi64x(0, 0, 0, -1),
1006 _mm256_set_epi64x(0, 0, -1, -1),
1007 _mm256_set_epi64x(0, -1, -1, -1),
1008 };
1009 const __m256i _ownedStateDummyMM256i{0x0};
1010 const __m256i _ownedStateOwnedMM256i{_mm256_set1_epi64x(static_cast<int64_t>(autopas::OwnershipState::owned))};
1011 const __m256d _cutoffSquared{};
1012 __m256d _shift6 = _mm256_setzero_pd();
1013 __m256d _epsilon24{};
1014 __m256d _sigmaSquared{};
1015#endif
1016
1017 const double _cutoffSquaredAoS = 0;
1018 double _epsilon24AoS, _sigmaSquaredAoS, _shift6AoS = 0;
1019
1020 ParticlePropertiesLibrary<double, size_t> *_PPLibrary = nullptr;
1021
1022 // sum of the potential energy, only calculated if calculateGlobals is true
1023 double _potentialEnergySum;
1024
1025 // sum of the virial, only calculated if calculateGlobals is true
1026 std::array<double, 3> _virialSum;
1027
1028 // thread buffer for aos
1029 std::vector<AoSThreadData> _aosThreadData;
1030
1031 // defines whether or whether not the global values are already preprocessed
1032 bool _postProcessed;
1033
1034 // number of double values that fit into a vector register.
1035 // MUST be power of 2 because some optimizations make this assumption
1036 constexpr static size_t vecLength = 4;
1037};
1038} // namespace mdLib
#define AutoPasLog(lvl, fmt,...)
Macro for logging providing common meta information without filename.
Definition: Logger.h:24
This class stores the (physical) properties of molecule types, and, in the case of multi-site molecul...
Definition: ParticlePropertiesLibrary.h:28
floatType getMixingShift6(intType i, intType j) const
Returns precomputed mixed shift * 6 for one pair of site types.
Definition: ParticlePropertiesLibrary.h:262
floatType getMixingSigmaSquared(intType i, intType j) const
Returns precomputed mixed squared sigma for one pair of site types.
Definition: ParticlePropertiesLibrary.h:252
floatType getMixing24Epsilon(intType i, intType j) const
Returns the precomputed mixed epsilon * 24.
Definition: ParticlePropertiesLibrary.h:224
static double calcShift6(double epsilon24, double sigmaSquared, double cutoffSquared)
Calculate the shift multiplied 6 of the lennard jones potential from given cutoff,...
Definition: ParticlePropertiesLibrary.h:576
AlignedAllocator class.
Definition: AlignedAllocator.h:29
Functor base class.
Definition: Functor.h:41
PairwiseFunctor class.
Definition: PairwiseFunctor.h:31
View on a fixed part of a SoA between a start index and an end index.
Definition: SoAView.h:23
size_t size() const
Returns the number of particles in the view.
Definition: SoAView.h:83
Default exception class for autopas exceptions.
Definition: ExceptionHandler.h:116
static void exception(const Exception e)
Handle an exception derived by std::exception.
Definition: ExceptionHandler.h:64
A functor to handle lennard-jones interactions between two particles (molecules).
Definition: LJFunctorAVX.h:50
bool allowsNewton3() final
Specifies whether the functor is capable of Newton3-like functors.
Definition: LJFunctorAVX.h:121
void initTraversal() final
Reset the global values.
Definition: LJFunctorAVX.h:853
LJFunctorAVX(double cutoff, ParticlePropertiesLibrary< double, size_t > &particlePropertiesLibrary)
Constructor for Functor with mixing active.
Definition: LJFunctorAVX.h:109
std::string getName() final
Returns name of functor.
Definition: LJFunctorAVX.h:117
double getVirial()
Get the virial.
Definition: LJFunctorAVX.h:914
double getPotentialEnergy()
Get the potential Energy.
Definition: LJFunctorAVX.h:896
static constexpr auto getComputedAttr()
Get attributes computed by this functor.
Definition: LJFunctorAVX.h:838
LJFunctorAVX(double cutoff)
Constructor for Functor with mixing disabled.
Definition: LJFunctorAVX.h:97
static constexpr bool getMixing()
Definition: LJFunctorAVX.h:847
void SoAFunctorSingle(autopas::SoAView< SoAArraysType > soa, bool newton3) final
PairwiseFunctor for structure of arrays (SoA)
Definition: LJFunctorAVX.h:187
void SoAFunctorVerlet(autopas::SoAView< SoAArraysType > soa, const size_t indexFirst, const std::vector< size_t, autopas::AlignedAllocator< size_t > > &neighborList, bool newton3) final
PairwiseFunctor for structure of arrays (SoA) for neighbor lists.
Definition: LJFunctorAVX.h:611
void AoSFunctor(Particle_T &i, Particle_T &j, bool newton3) final
PairwiseFunctor for arrays of structures (AoS).
Definition: LJFunctorAVX.h:129
void SoAFunctorPair(autopas::SoAView< SoAArraysType > soa1, autopas::SoAView< SoAArraysType > soa2, const bool newton3) final
PairwiseFunctor for structure of arrays (SoA)
Definition: LJFunctorAVX.h:194
bool allowsNonNewton3() final
Specifies whether the functor is capable of non-Newton3-like functors.
Definition: LJFunctorAVX.h:125
static constexpr auto getNeededAttr()
Get attributes needed for computation.
Definition: LJFunctorAVX.h:813
static constexpr auto getNeededAttr(std::false_type)
Get attributes needed for computation without N3 optimization.
Definition: LJFunctorAVX.h:828
bool isRelevantForTuning() final
Specifies whether the functor should be considered for the auto-tuning process.
Definition: LJFunctorAVX.h:119
void setParticleProperties(double epsilon24, double sigmaSquared)
Sets the particle properties constants for this functor.
Definition: LJFunctorAVX.h:935
LJFunctorAVX()=delete
Deleted default constructor.
void endTraversal(bool newton3) final
Accumulates global values, e.g.
Definition: LJFunctorAVX.h:866
constexpr T dot(const std::array< T, SIZE > &a, const std::array< T, SIZE > &b)
Generates the dot product of two arrays.
Definition: ArrayMath.h:233
int autopas_get_max_threads()
Dummy for omp_get_max_threads() when no OpenMP is available.
Definition: WrapOpenMP.h:144
@ dummy
Dummy or deleted state, a particle with this state is not an actual particle!
@ owned
Owned state, a particle with this state is an actual particle and owned by the current AutoPas object...
FunctorN3Modes
Newton 3 modes for the Functor.
Definition: Functor.h:23
int autopas_get_thread_num()
Dummy for omp_set_lock() when no OpenMP is available.
Definition: WrapOpenMP.h:132