#ifndef _libint2_src_lib_libint_vectorx86_h_
#define _libint2_src_lib_libint_vectorx86_h_

#include <libint2/cxxstd.h>
#include <libint2/type_traits.h>

#include <emmintrin.h>
#include <immintrin.h>

namespace libint2 {
namespace simd {
// VectorSSEDouble constructors: load from a 2-element array, or set two scalars
d = _mm_loadu_pd(&a[0]);
d = _mm_set_pd(a1, a0);

// compound assignment
d = _mm_add_pd(d, a.d);  // operator+=
d = _mm_sub_pd(d, a.d);  // operator-=

// unary minus: multiply by -1
const static __m128d minus_one = _mm_set_pd(-1.0, -1.0);
result.d = _mm_mul_pd(this->d, minus_one);
// conversion to a scalar double (C++11 and later)
#if LIBINT2_CPLUSPLUS_STD >= 2011
operator double() const {
  ::memcpy(&(d0[0]), &d, sizeof(__m128d));

// conversion to the raw 128-bit SSE type
operator __m128d() const {
// convert(): unaligned store of the elements; convert_aligned(): aligned store
_mm_storeu_pd(&a[0], d);
_mm_store_pd(&a[0], d);
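// --- illustrative usage sketch, not part of vector_x86.h ---
// convert() performs an unaligned store, while convert_aligned() maps to
// _mm_store_pd and therefore requires a 16-byte-aligned destination. A minimal
// sketch, assuming the VectorSSEDouble interface excerpted above (the function
// and variable names below are hypothetical):
inline void example_convert_sse_double(VectorSSEDouble v) {
  double any[2];                  // any address works for convert()
  alignas(16) double aligned[2];  // alignment is required for convert_aligned()
  v.convert(any);
  v.convert_aligned(aligned);
}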
// operator* overloads: scalar*vector and vector*scalar (the _a/_b temporaries
// hold the broadcast scalar), then vector*vector
c.d = _mm_mul_pd(_a.d, b.d);
c.d = _mm_mul_pd(a.d, _b.d);
c.d = _mm_mul_pd(_a.d, b.d);
c.d = _mm_mul_pd(a.d, _b.d);
c.d = _mm_mul_pd(a.d, b.d);

// operator+, operator-, operator/
c.d = _mm_add_pd(a.d, b.d);
c.d = _mm_sub_pd(a.d, b.d);
c.d = _mm_div_pd(a.d, b.d);
// fma_plus / fma_minus: fused multiply-add and multiply-subtract
#if defined(__FMA__)
d.d = _mm_fmadd_pd(a.d, b.d, c.d);
d.d = _mm_fmsub_pd(a.d, b.d, c.d);
#elif defined(__FMA4__)
d.d = _mm_macc_pd(a.d, b.d, c.d);
d.d = _mm_msub_pd(a.d, b.d, c.d);
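// --- illustrative usage sketch, not part of vector_x86.h ---
// fma_plus(a, b, c) computes a*b + c and fma_minus(a, b, c) computes a*b - c
// element-wise, using FMA3/FMA4 instructions when the compiler enables them
// (the generic fma_plus/fma_minus in intrinsic_operations.h cover the plain
// a*b + c form otherwise). A sketch of the intended equivalence, assuming the
// operators excerpted above; the fused form differs only by its single rounding:
inline void example_fma_sse_double(const VectorSSEDouble& a,
                                   const VectorSSEDouble& b,
                                   const VectorSSEDouble& c) {
  VectorSSEDouble fused   = fma_plus(a, b, c);  // a*b + c, rounded once
  VectorSSEDouble unfused = a * b + c;          // rounded twice
  (void)fused; (void)unfused;                   // results agree to ~1 ulp per element
}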
// horizontal_add(a): sum of the two elements of a
#if defined(__SSE3__)
__m128d t1 = _mm_hadd_pd(a, a);
return _mm_cvtsd_f64(t1);
#else
__m128 t0 = _mm_castpd_ps(a);
__m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0, t0));
__m128d t2 = _mm_add_sd(a, t1);
return _mm_cvtsd_f64(t2);
#endif

// horizontal_add(a, b): packs {sum of a, sum of b} into one __m128d
#if defined(__SSE3__)
return _mm_hadd_pd(a, b);
#else  // will be very inefficient without SSE3

// exp/sqrt/erf/erfc: use vector intrinsics when available, otherwise fall back
// to an element-wise scalar loop
result.d = _mm_exp_pd(a.d);
for (int i = 0; i < 2; ++i) a_d[i] = std::exp(a_d[i]);

result.d = _mm_sqrt_pd(a.d);
for (int i = 0; i < 2; ++i) a_d[i] = std::sqrt(a_d[i]);

result.d = _mm_erf_pd(a.d);
for (int i = 0; i < 2; ++i) a_d[i] = ::erf(a_d[i]);

result.d = _mm_erfc_pd(a.d);
for (int i = 0; i < 2; ++i) a_d[i] = ::erfc(a_d[i]);
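// --- illustrative sketch, not part of vector_x86.h ---
// _mm_sqrt_pd is a plain SSE2 intrinsic, but _mm_exp_pd/_mm_erf_pd/_mm_erfc_pd
// are SVML-style vector math intrinsics that not every compiler provides; the
// scalar loops above are the portable fallback. The same pattern written as a
// stand-alone function (the function name is hypothetical):
inline VectorSSEDouble exp_scalar_fallback(VectorSSEDouble a) {
  double a_d[2];
  a.convert(a_d);                                         // vector -> array
  for (int i = 0; i < 2; ++i) a_d[i] = std::exp(a_d[i]);  // scalar exp per element
  return VectorSSEDouble(a_d);                            // array -> vector
}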
os << "{" << ad[0] << "," << ad[1] << "}";
// vector-type traits for VectorSSEDouble
static const bool value = true;
typedef double value_type;
static const size_t extent = 2;
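// --- illustrative usage sketch, not part of vector_x86.h ---
// Putting the VectorSSEDouble pieces together: construct from arrays, multiply
// element-wise, and reduce with horizontal_add. Assumes only the interface
// excerpted above; the function name is hypothetical.
inline double example_dot2(const double (&x)[2], const double (&y)[2]) {
  VectorSSEDouble vx(x), vy(y);    // load the two 2-vectors
  VectorSSEDouble prod = vx * vy;  // element-wise product
  return horizontal_add(prod);     // x[0]*y[0] + x[1]*y[1]
}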
#include <xmmintrin.h>

namespace libint2 {
namespace simd {
// single-precision SSE vector (4 floats): constructors broadcast a scalar,
// load from a 4-element array, or set four scalars
d = _mm_set_ps(a, a, a, a);
d = _mm_loadu_ps(&a[0]);
d = _mm_set_ps(a3, a2, a1, a0);
d = _mm_set_ps(a, a, a, a);

d = _mm_add_ps(d, a.d);  // operator+=
d = _mm_sub_ps(d, a.d);  // operator-=

// unary minus
const static __m128 minus_one = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
result.d = _mm_mul_ps(this->d, minus_one);
#if LIBINT2_CPLUSPLUS_STD >= 2011
operator float() const {
  ::memcpy(&(d0[0]), &d, sizeof(__m128));

#if LIBINT2_CPLUSPLUS_STD >= 2011
operator double() const {
  const float result_flt = this->operator float();

operator __m128() const {
_mm_storeu_ps(&a[0], d);
_mm_store_ps(&a[0], d);
c.d = _mm_mul_ps(_a.d, b.d);
c.d = _mm_mul_ps(a.d, _b.d);
c.d = _mm_mul_ps(_a.d, b.d);
c.d = _mm_mul_ps(a.d, _b.d);
c.d = _mm_mul_ps(_a.d, b.d);
c.d = _mm_mul_ps(a.d, _b.d);
c.d = _mm_mul_ps(a.d, b.d);

c.d = _mm_add_ps(a.d, b.d);
c.d = _mm_sub_ps(a.d, b.d);
c.d = _mm_div_ps(a.d, b.d);
#if defined(__FMA__)
d.d = _mm_fmadd_ps(a.d, b.d, c.d);
d.d = _mm_fmsub_ps(a.d, b.d, c.d);
#elif defined(__FMA4__)
d.d = _mm_macc_ps(a.d, b.d, c.d);
d.d = _mm_msub_ps(a.d, b.d, c.d);
result.d = _mm_exp_ps(a.d);
for (int i = 0; i < 4; ++i) a_d[i] = std::exp(a_d[i]);

result.d = _mm_sqrt_ps(a.d);
for (int i = 0; i < 4; ++i) a_d[i] = std::sqrt(a_d[i]);

result.d = _mm_erf_ps(a.d);
for (int i = 0; i < 4; ++i) a_d[i] = ::erf(a_d[i]);

result.d = _mm_erfc_ps(a.d);
for (int i = 0; i < 4; ++i) a_d[i] = ::erfc(a_d[i]);
os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
static const bool value = true;
typedef float value_type;
static const size_t extent = 4;
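// --- illustrative usage sketch, not part of vector_x86.h ---
// The single-precision wrapper (named VectorSSEFloat elsewhere in the library;
// treat the name as an assumption here) packs four floats. Scalars are
// broadcast by the mixed operator* overloads, and operator<< prints the
// elements as "{a0,a1,a2,a3}". The function name below is hypothetical.
inline void example_sse_float(std::ostream& os) {
  VectorSSEFloat v(1.0f, 2.0f, 3.0f, 4.0f);
  VectorSSEFloat w = 2.0f * v + v;  // broadcast the scalar, then add element-wise
  os << w;                          // prints {3,6,9,12}
}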
#include <immintrin.h>

namespace libint2 {
namespace simd {
// VectorAVXDouble: SIMD vector of 4 double-precision reals, operated on with AVX
// constructors: broadcast a scalar, load from a 4-element array, or set four scalars
d = _mm256_set_pd(a, a, a, a);
d = _mm256_loadu_pd(&a[0]);
d = _mm256_set_pd(a3, a2, a1, a0);
d = _mm256_set_pd(a, a, a, a);

d = _mm256_add_pd(d, a.d);  // operator+=
d = _mm256_sub_pd(d, a.d);  // operator-=

// unary minus
const static __m256d minus_one = _mm256_set_pd(-1.0, -1.0, -1.0, -1.0);
result.d = _mm256_mul_pd(this->d, minus_one);
#if LIBINT2_CPLUSPLUS_STD >= 2011
operator double() const {
  ::memcpy(&(d0[0]), &d, sizeof(__m256d));

operator __m256d() const {
d = _mm256_loadu_pd(a);      // load(): loads a to this
d = _mm256_load_pd(a);       // load_aligned(): loads a to this (aligned)
_mm256_storeu_pd(&a[0], d);  // convert(): writes this to a
_mm256_store_pd(&a[0], d);   // convert_aligned(): writes this to a (aligned)
c.d = _mm256_mul_pd(_a.d, b.d);
c.d = _mm256_mul_pd(a.d, _b.d);
c.d = _mm256_mul_pd(_a.d, b.d);
c.d = _mm256_mul_pd(a.d, _b.d);
c.d = _mm256_mul_pd(a.d, b.d);

c.d = _mm256_add_pd(a.d, b.d);
c.d = _mm256_add_pd(_a.d, b.d);
c.d = _mm256_add_pd(a.d, _b.d);
c.d = _mm256_sub_pd(a.d, b.d);
c.d = _mm256_div_pd(a.d, b.d);
#if defined(__FMA__)
d.d = _mm256_fmadd_pd(a.d, b.d, c.d);
d.d = _mm256_fmsub_pd(a.d, b.d, c.d);
#elif defined(__FMA4__)
d.d = _mm256_macc_pd(a.d, b.d, c.d);
d.d = _mm256_msub_pd(a.d, b.d, c.d);
__m256d s = _mm256_hadd_pd(a, a);
return ((double*)&s)[0] + ((double*)&s)[2];
__m256d sum = _mm256_hadd_pd(a, b);
__m128d sum_high = _mm256_extractf128_pd(sum, 1);
return _mm_add_pd(sum_high, _mm256_castpd256_pd128(sum));
__m256d sumab = _mm256_hadd_pd(a, b);
__m256d sumcd = _mm256_hadd_pd(c, d);
__m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
__m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);
return _mm256_add_pd(perm, blend);
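// --- illustrative trace, not part of vector_x86.h ---
// How the four-vector reduction above works, element by element (lowest
// element first), assuming a = {a0,a1,a2,a3} and likewise for b, c, d:
//   sumab = hadd(a, b)               = {a0+a1, b0+b1, a2+a3, b2+b3}
//   sumcd = hadd(c, d)               = {c0+c1, d0+d1, c2+c3, d2+d3}
//   blend (mask 0b1100)              = {a0+a1, b0+b1, c2+c3, d2+d3}
//   perm  (cross-lane, control 0x21) = {a2+a3, b2+b3, c0+c1, d0+d1}
//   perm + blend                     = {sum(a), sum(b), sum(c), sum(d)}
// The two-vector overload above it is the 128-bit analogue: hadd(a, b) gives
// {a0+a1, b0+b1, a2+a3, b2+b3}, and adding its high lane to its low lane
// leaves {sum(a), sum(b)}.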
result.d = _mm256_exp_pd(a.d);
for (int i = 0; i < 4; ++i) a_d[i] = ::exp(a_d[i]);

result.d = _mm256_sqrt_pd(a.d);
for (int i = 0; i < 4; ++i) a_d[i] = ::sqrt(a_d[i]);

result.d = _mm256_erf_pd(a.d);
for (int i = 0; i < 4; ++i) a_d[i] = ::erf(a_d[i]);

result.d = _mm256_erfc_pd(a.d);
for (int i = 0; i < 4; ++i) a_d[i] = ::erfc(a_d[i]);
os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
static const bool value = true;
typedef double value_type;
static const size_t extent = 4;
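// --- illustrative usage sketch, not part of vector_x86.h ---
// VectorAVXDouble processes four doubles per operation; combined with the
// reductions above it vectorizes small kernels such as a length-4 dot product.
// Assumes only the interface excerpted above; the function name is hypothetical.
inline double example_dot4(const double (&x)[4], const double (&y)[4]) {
  VectorAVXDouble vx(x), vy(y);    // constructors from ordinary static-sized arrays
  return horizontal_add(vx * vy);  // x[0]*y[0] + x[1]*y[1] + x[2]*y[2] + x[3]*y[3]
}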
#ifdef LIBINT2_HAVE_AGNER_VECTORCLASS
#include <vectorclass.h>
#endif

#endif  // header guard