LIBINT  2.1.0-stable
vector_x86.h
/*
 *  This file is a part of Libint.
 *  Copyright (C) 2004-2014 Edward F. Valeev
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Library General Public License, version 2,
 *  as published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU Library General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 *
 */

#ifndef _libint2_src_lib_libint_vectorx86_h_
#define _libint2_src_lib_libint_vectorx86_h_

#include <cstring>
#include <libint2/cxxstd.h>
#include <libint2/type_traits.h>

#ifdef __SSE2__

#include <emmintrin.h>
#include <immintrin.h>
#include <cmath>
#include <iostream>

namespace libint2 { namespace simd {

  /// SIMD vector of 2 double-precision floating-point reals; operations use SSE2 instructions
  struct VectorSSEDouble {

      typedef double T;
      __m128d d;

      /// creates a vector of default-initialized values
      VectorSSEDouble() {}

      /// initializes all elements to the same value
      VectorSSEDouble(T a) {
        d = _mm_set_pd(a, a);
      }

      /// creates a vector initialized by an ordinary static-sized array
      VectorSSEDouble(T (&a)[2]) {
        d = _mm_loadu_pd(&a[0]);
      }

      /// creates a vector from two values
      VectorSSEDouble(T a0, T a1) {
        d = _mm_set_pd(a1, a0);
      }

      /// converts a 128-bit SSE double vector type to VectorSSEDouble
      VectorSSEDouble(__m128d a) {
        d = a;
      }

      VectorSSEDouble& operator=(T a) {
        d = _mm_set_pd(a, a);
        return *this;
      }

      VectorSSEDouble& operator+=(VectorSSEDouble a) {
        d = _mm_add_pd(d, a.d);
        return *this;
      }

      VectorSSEDouble& operator-=(VectorSSEDouble a) {
        d = _mm_sub_pd(d, a.d);
        return *this;
      }

      VectorSSEDouble operator-() const {
        // TODO improve using bitflips
        const static __m128d minus_one = _mm_set_pd(-1.0, -1.0);
        VectorSSEDouble result;
        result.d = _mm_mul_pd(this->d, minus_one);
        return result;
      }

#if LIBINT2_CPLUSPLUS_STD >= 2011
      explicit
#endif
      operator double() const {
        double d0[2];
        ::memcpy(&(d0[0]), &d, sizeof(__m128d));
        // this segfaults on OS X:
        //   _mm_storeu_pd(&(d0[0]), d);
        // an aligned alternative requires C++11's alignas, but efficiency should not matter here:
        //   alignas(__m128d) double d0[2];
        //   _mm_store_pd(&(d0[0]), d);
        return d0[0];
      }

      /// implicit conversion to the underlying 128-bit SSE double vector type
      operator __m128d() const {
        return d;
      }

      /// loads a to this
      void load(T const* a) {
        d = _mm_loadu_pd(a);
      }
      /// loads a to this (a must be aligned)
      void load_aligned(T const* a) {
        d = _mm_load_pd(a);
      }
      /// writes this to a
      void convert(T* a) const {
        _mm_storeu_pd(&a[0], d);
      }
      /// writes this to a (a must be aligned)
      void convert_aligned(T* a) const {
        _mm_store_pd(&a[0], d);
      }

  };

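  /*
   * Illustrative usage sketch (not part of the library; assumes an SSE2-capable build):
   *
   *   double xy[2] = {1.0, 2.0};
   *   libint2::simd::VectorSSEDouble v(xy);   // {1.0, 2.0}
   *   libint2::simd::VectorSSEDouble w(3.0);  // {3.0, 3.0}
   *   v += w;                                 // {4.0, 5.0}
   *   double out[2];
   *   v.convert(out);                         // out = {4.0, 5.0}
   */
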
  inline VectorSSEDouble operator*(double a, VectorSSEDouble b) {
    VectorSSEDouble c;
    VectorSSEDouble _a(a);
    c.d = _mm_mul_pd(_a.d, b.d);
    return c;
  }

  inline VectorSSEDouble operator*(VectorSSEDouble a, double b) {
    VectorSSEDouble c;
    VectorSSEDouble _b(b);
    c.d = _mm_mul_pd(a.d, _b.d);
    return c;
  }

  inline VectorSSEDouble operator*(int a, VectorSSEDouble b) {
    if (a == 1)
      return b;
    else {
      VectorSSEDouble c;
      VectorSSEDouble _a((double)a);
      c.d = _mm_mul_pd(_a.d, b.d);
      return c;
    }
  }

  inline VectorSSEDouble operator*(VectorSSEDouble a, int b) {
    if (b == 1)
      return a;
    else {
      VectorSSEDouble c;
      VectorSSEDouble _b((double)b);
      c.d = _mm_mul_pd(a.d, _b.d);
      return c;
    }
  }

  inline VectorSSEDouble operator*(VectorSSEDouble a, VectorSSEDouble b) {
    VectorSSEDouble c;
    c.d = _mm_mul_pd(a.d, b.d);
    return c;
  }

  inline VectorSSEDouble operator+(VectorSSEDouble a, VectorSSEDouble b) {
    VectorSSEDouble c;
    c.d = _mm_add_pd(a.d, b.d);
    return c;
  }

  inline VectorSSEDouble operator-(VectorSSEDouble a, VectorSSEDouble b) {
    VectorSSEDouble c;
    c.d = _mm_sub_pd(a.d, b.d);
    return c;
  }

  inline VectorSSEDouble operator/(VectorSSEDouble a, VectorSSEDouble b) {
    VectorSSEDouble c;
    c.d = _mm_div_pd(a.d, b.d);
    return c;
  }

#if defined(__FMA__)
  inline VectorSSEDouble fma_plus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
    VectorSSEDouble d;
    d.d = _mm_fmadd_pd(a.d, b.d, c.d);
    return d;
  }
  inline VectorSSEDouble fma_minus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
    VectorSSEDouble d;
    d.d = _mm_fmsub_pd(a.d, b.d, c.d);
    return d;
  }
#elif defined(__FMA4__)
  inline VectorSSEDouble fma_plus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
    VectorSSEDouble d;
    d.d = _mm_macc_pd(a.d, b.d, c.d);
    return d;
  }
  inline VectorSSEDouble fma_minus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
    VectorSSEDouble d;
    d.d = _mm_msub_pd(a.d, b.d, c.d);
    return d;
  }
#endif

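  /*
   * Illustrative sketch (not part of the library): with FMA (or FMA4) enabled at compile time,
   * fma_plus/fma_minus evaluate a*b +/- c with one fused instruction per vector:
   *
   *   libint2::simd::VectorSSEDouble a(2.0), b(3.0), c(1.0);
   *   libint2::simd::VectorSSEDouble r = fma_plus(a, b, c);   // {7.0, 7.0} = a*b + c
   */
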
  /// Horizontal add: returns the sum of the two elements of a
  inline double horizontal_add(VectorSSEDouble const & a) {
    // Agner Fog's version
#if defined(__SSE3__)
    __m128d t1 = _mm_hadd_pd(a,a);
    return _mm_cvtsd_f64(t1);
#else // SSE2 only
    __m128  t0 = _mm_castpd_ps(a);
    __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0,t0));
    __m128d t2 = _mm_add_sd(a,t1);
    return _mm_cvtsd_f64(t2);
#endif
  }

  /// Horizontal add of a pair of vectors: returns {a[0]+a[1], b[0]+b[1]}
  inline VectorSSEDouble horizontal_add(VectorSSEDouble const & a, VectorSSEDouble const & b) {
#if defined(__SSE3__)
    return _mm_hadd_pd(a, b);
#else // will be very inefficient without SSE3
    return VectorSSEDouble(horizontal_add(a), horizontal_add(b));
#endif
  }

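  /*
   * Illustrative sketch (not part of the library): horizontal_add reduces across vector lanes,
   *
   *   libint2::simd::VectorSSEDouble a(1.0, 2.0), b(3.0, 4.0);
   *   double s = horizontal_add(a);                              // 3.0
   *   libint2::simd::VectorSSEDouble t = horizontal_add(a, b);   // {3.0, 7.0}
   */
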
  // transcendental functions: use Intel SVML if available, otherwise evaluate elementwise

  inline VectorSSEDouble exp(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
    VectorSSEDouble result;
    result.d = _mm_exp_pd(a.d);
#else
    double a_d[2]; a.convert(a_d);
    for(int i=0; i<2; ++i) a_d[i] = std::exp(a_d[i]);
    VectorSSEDouble result(a_d);
#endif
    return result;
  }
  inline VectorSSEDouble sqrt(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
    VectorSSEDouble result;
    result.d = _mm_sqrt_pd(a.d);
#else
    double a_d[2]; a.convert(a_d);
    for(int i=0; i<2; ++i) a_d[i] = std::sqrt(a_d[i]);
    VectorSSEDouble result(a_d);
#endif
    return result;
  }
  inline VectorSSEDouble erf(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
    VectorSSEDouble result;
    result.d = _mm_erf_pd(a.d);
#else
    double a_d[2]; a.convert(a_d);
    for(int i=0; i<2; ++i) a_d[i] = ::erf(a_d[i]);
    VectorSSEDouble result(a_d);
#endif
    return result;
  }
  inline VectorSSEDouble erfc(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
    VectorSSEDouble result;
    result.d = _mm_erfc_pd(a.d);
#else
    double a_d[2]; a.convert(a_d);
    for(int i=0; i<2; ++i) a_d[i] = ::erfc(a_d[i]);
    VectorSSEDouble result(a_d);
#endif
    return result;
  }

}} // namespace libint2::simd

inline std::ostream& operator<<(std::ostream& os, libint2::simd::VectorSSEDouble a) {
  double ad[2];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "}";
  return os;
}

namespace libint2 {

  template <>
  struct is_vector<simd::VectorSSEDouble> {
      static const bool value = true;
  };

  template <>
  struct vector_traits<simd::VectorSSEDouble> {
      typedef double value_type;
      static const size_t extent = 2;
  };

} // namespace libint2

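/*
 * Illustrative sketch (not part of the library): generic code can use these traits to query the
 * scalar type and width of a vector type at compile time, e.g.
 *
 *   typedef libint2::vector_traits<libint2::simd::VectorSSEDouble>::value_type scalar_t;  // double
 *   const size_t width = libint2::vector_traits<libint2::simd::VectorSSEDouble>::extent;  // 2
 *   static_assert(libint2::is_vector<libint2::simd::VectorSSEDouble>::value, "is a SIMD vector");
 */
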
#endif // SSE2-only

#ifdef __SSE__

#include <xmmintrin.h>

namespace libint2 { namespace simd {

  /// SIMD vector of 4 single-precision floating-point reals; operations use SSE instructions
  struct VectorSSEFloat {

      typedef float T;
      __m128 d;

      /// creates a vector of default-initialized values
      VectorSSEFloat() {}

      /// initializes all elements to the same value
      VectorSSEFloat(T a) {
        d = _mm_set_ps(a, a, a, a);
      }

      /// creates a vector initialized by an ordinary static-sized array
      VectorSSEFloat(T (&a)[4]) {
        d = _mm_loadu_ps(&a[0]);
      }

      /// creates a vector from four values
      VectorSSEFloat(T a0, T a1, T a2, T a3) {
        d = _mm_set_ps(a3, a2, a1, a0);
      }

      VectorSSEFloat& operator=(T a) {
        d = _mm_set_ps(a, a, a, a);
        return *this;
      }

      VectorSSEFloat& operator+=(VectorSSEFloat a) {
        d = _mm_add_ps(d, a.d);
        return *this;
      }

      VectorSSEFloat& operator-=(VectorSSEFloat a) {
        d = _mm_sub_ps(d, a.d);
        return *this;
      }

      VectorSSEFloat operator-() const {
        // TODO improve using bitflips
        const static __m128 minus_one = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
        VectorSSEFloat result;
        result.d = _mm_mul_ps(this->d, minus_one);
        return result;
      }

#if LIBINT2_CPLUSPLUS_STD >= 2011
      explicit
#endif
      operator float() const {
        float d0[4];
        ::memcpy(&(d0[0]), &d, sizeof(__m128));
        // this segfaults on OS X:
        //   _mm_storeu_ps(&(d0[0]), d);
        // an aligned alternative requires C++11's alignas, but efficiency should not matter here:
        //   alignas(__m128) float d0[4];
        //   _mm_store_ps(&(d0[0]), d);
        return d0[0];
      }

#if LIBINT2_CPLUSPLUS_STD >= 2011
      explicit
#endif
      operator double() const {
        const float result_flt = this->operator float();
        return result_flt;
      }

      operator __m128() const {
        return d;
      }

      /// loads a to this
      void load(T const* a) {
        d = _mm_loadu_ps(a);
      }
      /// loads a to this (a must be aligned)
      void load_aligned(T const* a) {
        d = _mm_load_ps(a);
      }
      /// writes this to a
      void convert(T* a) const {
        _mm_storeu_ps(&a[0], d);
      }
      /// writes this to a (a must be aligned)
      void convert_aligned(T* a) const {
        _mm_store_ps(&a[0], d);
      }
  };

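  /*
   * Illustrative sketch (not part of the library): note that the explicit conversions to
   * float/double return only element 0 of the vector, they do not reduce across lanes:
   *
   *   libint2::simd::VectorSSEFloat v(1.0f, 2.0f, 3.0f, 4.0f);
   *   float f = static_cast<float>(v);   // 1.0f, i.e. v[0] only
   */
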
  inline VectorSSEFloat operator*(float a, VectorSSEFloat b) {
    VectorSSEFloat c;
    VectorSSEFloat _a(a);
    c.d = _mm_mul_ps(_a.d, b.d);
    return c;
  }

  inline VectorSSEFloat operator*(VectorSSEFloat a, float b) {
    VectorSSEFloat c;
    VectorSSEFloat _b(b);
    c.d = _mm_mul_ps(a.d, _b.d);
    return c;
  }

  // narrows a!
  inline VectorSSEFloat operator*(double a, VectorSSEFloat b) {
    VectorSSEFloat c;
    VectorSSEFloat _a((float)a);
    c.d = _mm_mul_ps(_a.d, b.d);
    return c;
  }

  // narrows b!
  inline VectorSSEFloat operator*(VectorSSEFloat a, double b) {
    VectorSSEFloat c;
    VectorSSEFloat _b((float)b);
    c.d = _mm_mul_ps(a.d, _b.d);
    return c;
  }

  inline VectorSSEFloat operator*(int a, VectorSSEFloat b) {
    if (a == 1)
      return b;
    else {
      VectorSSEFloat c;
      VectorSSEFloat _a((float)a);
      c.d = _mm_mul_ps(_a.d, b.d);
      return c;
    }
  }

  inline VectorSSEFloat operator*(VectorSSEFloat a, int b) {
    if (b == 1)
      return a;
    else {
      VectorSSEFloat c;
      VectorSSEFloat _b((float)b);
      c.d = _mm_mul_ps(a.d, _b.d);
      return c;
    }
  }

  inline VectorSSEFloat operator*(VectorSSEFloat a, VectorSSEFloat b) {
    VectorSSEFloat c;
    c.d = _mm_mul_ps(a.d, b.d);
    return c;
  }

  inline VectorSSEFloat operator+(VectorSSEFloat a, VectorSSEFloat b) {
    VectorSSEFloat c;
    c.d = _mm_add_ps(a.d, b.d);
    return c;
  }

  inline VectorSSEFloat operator-(VectorSSEFloat a, VectorSSEFloat b) {
    VectorSSEFloat c;
    c.d = _mm_sub_ps(a.d, b.d);
    return c;
  }

  inline VectorSSEFloat operator/(VectorSSEFloat a, VectorSSEFloat b) {
    VectorSSEFloat c;
    c.d = _mm_div_ps(a.d, b.d);
    return c;
  }

#if defined(__FMA__)
  inline VectorSSEFloat fma_plus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
    VectorSSEFloat d;
    d.d = _mm_fmadd_ps(a.d, b.d, c.d);
    return d;
  }
  inline VectorSSEFloat fma_minus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
    VectorSSEFloat d;
    d.d = _mm_fmsub_ps(a.d, b.d, c.d);
    return d;
  }
#elif defined(__FMA4__)
  inline VectorSSEFloat fma_plus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
    VectorSSEFloat d;
    d.d = _mm_macc_ps(a.d, b.d, c.d);
    return d;
  }
  inline VectorSSEFloat fma_minus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
    VectorSSEFloat d;
    d.d = _mm_msub_ps(a.d, b.d, c.d);
    return d;
  }
#endif

  // transcendental functions: use Intel SVML if available, otherwise evaluate elementwise

  inline VectorSSEFloat exp(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
    VectorSSEFloat result;
    result.d = _mm_exp_ps(a.d);
#else
    float a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = std::exp(a_d[i]);
    VectorSSEFloat result(a_d);
#endif
    return result;
  }
  inline VectorSSEFloat sqrt(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
    VectorSSEFloat result;
    result.d = _mm_sqrt_ps(a.d);
#else
    float a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = std::sqrt(a_d[i]);
    VectorSSEFloat result(a_d);
#endif
    return result;
  }
  inline VectorSSEFloat erf(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
    VectorSSEFloat result;
    result.d = _mm_erf_ps(a.d);
#else
    float a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::erf(a_d[i]);
    VectorSSEFloat result(a_d);
#endif
    return result;
  }
  inline VectorSSEFloat erfc(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
    VectorSSEFloat result;
    result.d = _mm_erfc_ps(a.d);
#else
    float a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::erfc(a_d[i]);
    VectorSSEFloat result(a_d);
#endif
    return result;
  }

}} // namespace libint2::simd

inline std::ostream& operator<<(std::ostream& os, libint2::simd::VectorSSEFloat a) {
  float ad[4];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
  return os;
}

namespace libint2 {

  template <>
  struct is_vector<simd::VectorSSEFloat> {
      static const bool value = true;
  };

  template <>
  struct vector_traits<simd::VectorSSEFloat> {
      typedef float value_type;
      static const size_t extent = 4;
  };

} // namespace libint2

#endif // SSE-only

#ifdef __AVX__

#include <immintrin.h>

namespace libint2 { namespace simd {

  /// SIMD vector of 4 double-precision floating-point reals; operations use AVX instructions
  struct VectorAVXDouble {

      typedef double T;
      __m256d d;

      /// creates a vector of default-initialized values
      VectorAVXDouble() {}

      /// initializes all elements to the same value
      VectorAVXDouble(T a) {
        d = _mm256_set_pd(a, a, a, a);
      }

      /// creates a vector initialized by an ordinary static-sized array
      VectorAVXDouble(T (&a)[4]) {
        d = _mm256_loadu_pd(&a[0]);
      }

      /// creates a vector from four values
      VectorAVXDouble(T a0, T a1, T a2, T a3) {
        d = _mm256_set_pd(a3, a2, a1, a0);
      }

      /// converts a 256-bit AVX double vector type to VectorAVXDouble
      VectorAVXDouble(__m256d a) {
        d = a;
      }

      VectorAVXDouble& operator=(T a) {
        d = _mm256_set_pd(a, a, a, a);
        return *this;
      }

      VectorAVXDouble& operator+=(VectorAVXDouble a) {
        d = _mm256_add_pd(d, a.d);
        return *this;
      }

      VectorAVXDouble& operator-=(VectorAVXDouble a) {
        d = _mm256_sub_pd(d, a.d);
        return *this;
      }

      VectorAVXDouble operator-() const {
        // TODO improve using bitflips
        const static __m256d minus_one = _mm256_set_pd(-1.0, -1.0, -1.0, -1.0);
        VectorAVXDouble result;
        result.d = _mm256_mul_pd(this->d, minus_one);
        return result;
      }

#if LIBINT2_CPLUSPLUS_STD >= 2011
      explicit
#endif
      operator double() const {
        double d0[4];
        ::memcpy(&(d0[0]), &d, sizeof(__m256d));
        // this segfaults on OS X:
        //   _mm256_storeu_pd(&d0[0], d);
        // an aligned alternative requires C++11's alignas, but efficiency should not matter here:
        //   alignas(__m256d) double d0[4];
        //   _mm256_store_pd(&(d0[0]), d);
        return d0[0];
      }

      operator __m256d() const {
        return d;
      }

      /// loads a to this
      void load(T const* a) {
        d = _mm256_loadu_pd(a);
      }
      /// loads a to this (a must be aligned)
      void load_aligned(T const* a) {
        d = _mm256_load_pd(a);
      }
      /// writes this to a
      void convert(T* a) const {
        _mm256_storeu_pd(&a[0], d);
      }
      /// writes this to a (a must be aligned)
      void convert_aligned(T* a) const {
        _mm256_store_pd(&a[0], d);
      }
  };

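  /*
   * Illustrative usage sketch (not part of the library; assumes an AVX-capable build and C++11 alignas):
   *
   *   alignas(32) double q[4] = {1.0, 2.0, 3.0, 4.0};
   *   libint2::simd::VectorAVXDouble v;
   *   v.load_aligned(q);                           // requires 32-byte alignment; v.load(q) does not
   *   libint2::simd::VectorAVXDouble w = 2.0 * v;  // {2.0, 4.0, 6.0, 8.0}
   *   w.convert(q);                                // write back (unaligned store)
   */
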
  inline VectorAVXDouble operator*(double a, VectorAVXDouble b) {
    VectorAVXDouble c;
    VectorAVXDouble _a(a);
    c.d = _mm256_mul_pd(_a.d, b.d);
    return c;
  }

  inline VectorAVXDouble operator*(VectorAVXDouble a, double b) {
    VectorAVXDouble c;
    VectorAVXDouble _b(b);
    c.d = _mm256_mul_pd(a.d, _b.d);
    return c;
  }

  inline VectorAVXDouble operator*(int a, VectorAVXDouble b) {
    if (a == 1)
      return b;
    else {
      VectorAVXDouble c;
      VectorAVXDouble _a((double)a);
      c.d = _mm256_mul_pd(_a.d, b.d);
      return c;
    }
  }

  inline VectorAVXDouble operator*(VectorAVXDouble a, int b) {
    if (b == 1)
      return a;
    else {
      VectorAVXDouble c;
      VectorAVXDouble _b((double)b);
      c.d = _mm256_mul_pd(a.d, _b.d);
      return c;
    }
  }

  inline VectorAVXDouble operator*(VectorAVXDouble a, VectorAVXDouble b) {
    VectorAVXDouble c;
    c.d = _mm256_mul_pd(a.d, b.d);
    return c;
  }

  inline VectorAVXDouble operator+(VectorAVXDouble a, VectorAVXDouble b) {
    VectorAVXDouble c;
    c.d = _mm256_add_pd(a.d, b.d);
    return c;
  }

  inline VectorAVXDouble operator+(int a, VectorAVXDouble b) {
    if (a == 0) {
      return b;
    }
    else {
      VectorAVXDouble c;
      VectorAVXDouble _a(static_cast<double>(a));
      c.d = _mm256_add_pd(_a.d, b.d);
      return c;
    }
  }

  inline VectorAVXDouble operator+(VectorAVXDouble a, int b) {
    if (b == 0) {
      return a;
    }
    else {
      VectorAVXDouble c;
      VectorAVXDouble _b(static_cast<double>(b));
      c.d = _mm256_add_pd(a.d, _b.d);
      return c;
    }
  }

  inline VectorAVXDouble operator-(VectorAVXDouble a, VectorAVXDouble b) {
    VectorAVXDouble c;
    c.d = _mm256_sub_pd(a.d, b.d);
    return c;
  }

  inline VectorAVXDouble operator/(VectorAVXDouble a, VectorAVXDouble b) {
    VectorAVXDouble c;
    c.d = _mm256_div_pd(a.d, b.d);
    return c;
  }

#if defined(__FMA__)
  inline VectorAVXDouble fma_plus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
    VectorAVXDouble d;
    d.d = _mm256_fmadd_pd(a.d, b.d, c.d);
    return d;
  }
  inline VectorAVXDouble fma_minus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
    VectorAVXDouble d;
    d.d = _mm256_fmsub_pd(a.d, b.d, c.d);
    return d;
  }
#elif defined(__FMA4__)
  inline VectorAVXDouble fma_plus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
    VectorAVXDouble d;
    d.d = _mm256_macc_pd(a.d, b.d, c.d);
    return d;
  }
  inline VectorAVXDouble fma_minus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
    VectorAVXDouble d;
    d.d = _mm256_msub_pd(a.d, b.d, c.d);
    return d;
  }
#endif

  /// Horizontal add: returns the sum of the four elements of a
  inline double horizontal_add(VectorAVXDouble const & a) {
    __m256d s = _mm256_hadd_pd(a,a);
    return ((double*)&s)[0] + ((double*)&s)[2];
    // Agner Fog's version:
    //   __m256d t1 = _mm256_hadd_pd(a,a);
    //   __m128d t2 = _mm256_extractf128_pd(t1,1);
    //   __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
    //   return _mm_cvtsd_f64(t3);
  }

  /// Horizontal add of a pair of vectors: returns {a[0]+a[1]+a[2]+a[3], b[0]+b[1]+b[2]+b[3]} as a 128-bit vector
  inline __m128d horizontal_add(VectorAVXDouble const & a, VectorAVXDouble const & b) {
    // solution from http://stackoverflow.com/questions/9775538/fastest-way-to-do-horizontal-vector-sum-with-avx-instructions
    __m256d sum = _mm256_hadd_pd(a, b);
    // extract upper 128 bits of result
    __m128d sum_high = _mm256_extractf128_pd(sum, 1);
    // add upper 128 bits of sum to its lower 128 bits
    return _mm_add_pd(sum_high, _mm256_castpd256_pd128(sum));
  }

  /// Horizontal add of four vectors: returns {sum(a), sum(b), sum(c), sum(d)}
  inline VectorAVXDouble horizontal_add(VectorAVXDouble const & a,
                                        VectorAVXDouble const & b,
                                        VectorAVXDouble const & c,
                                        VectorAVXDouble const & d) {
    // solution from http://stackoverflow.com/questions/10833234/4-horizontal-double-precision-sums-in-one-go-with-avx?lq=1
    // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
    __m256d sumab = _mm256_hadd_pd(a, b);
    // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
    __m256d sumcd = _mm256_hadd_pd(c, d);

    // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
    __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
    // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
    __m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

    return _mm256_add_pd(perm, blend);
  }

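  /*
   * Illustrative sketch (not part of the library): the 4-vector horizontal_add packs four
   * independent lane reductions into one 256-bit result,
   *
   *   libint2::simd::VectorAVXDouble a(1.0), b(2.0), c(3.0), d(4.0);     // each a broadcast
   *   libint2::simd::VectorAVXDouble r = horizontal_add(a, b, c, d);     // {4.0, 8.0, 12.0, 16.0}
   */
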
  // transcendental functions: use Intel SVML if available, otherwise evaluate elementwise

  inline VectorAVXDouble exp(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
    VectorAVXDouble result;
    result.d = _mm256_exp_pd(a.d);
#else
    double a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::exp(a_d[i]);
    VectorAVXDouble result(a_d);
#endif
    return result;
  }
  inline VectorAVXDouble sqrt(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
    VectorAVXDouble result;
    result.d = _mm256_sqrt_pd(a.d);
#else
    double a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::sqrt(a_d[i]);
    VectorAVXDouble result(a_d);
#endif
    return result;
  }
  inline VectorAVXDouble erf(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
    VectorAVXDouble result;
    result.d = _mm256_erf_pd(a.d);
#else
    double a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::erf(a_d[i]);
    VectorAVXDouble result(a_d);
#endif
    return result;
  }
  inline VectorAVXDouble erfc(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
    VectorAVXDouble result;
    result.d = _mm256_erfc_pd(a.d);
#else
    double a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::erfc(a_d[i]);
    VectorAVXDouble result(a_d);
#endif
    return result;
  }

}} // namespace libint2::simd

inline std::ostream& operator<<(std::ostream& os, libint2::simd::VectorAVXDouble a) {
  double ad[4];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
  return os;
}

namespace libint2 {

  template <>
  struct is_vector<simd::VectorAVXDouble> {
      static const bool value = true;
  };

  template <>
  struct vector_traits<simd::VectorAVXDouble> {
      typedef double value_type;
      static const size_t extent = 4;
  };

} // namespace libint2

#endif // AVX-only

#ifdef LIBINT2_HAVE_AGNER_VECTORCLASS
#include <vectorclass.h>
#endif

#endif // header guard