LIBINT  2.1.0-stable
boys.h
1 /*
2  * This file is a part of Libint.
3  * Copyright (C) 2004-2014 Edward F. Valeev
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Library General Public License, version 2,
7  * as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU Library General Public License
15  * along with this program. If not, see http://www.gnu.org/licenses/.
16  *
17  */
18 
19 // prototype for the Boys function engines (Boys function = Fm(T))
20 // the Chebyshev extrapolation code is based on that by Frank Neese
21 
22 #ifndef _libint2_src_lib_libint_boys_h_
23 #define _libint2_src_lib_libint_boys_h_
24 
25 #if defined(__cplusplus)
26 
27 #include <iostream>
28 #include <cstdlib>
29 #include <cmath>
30 #include <stdexcept>
31 #include <libint2/vector.h>
32 #include <cassert>
33 #include <vector>
34 #include <algorithm>
35 #include <limits>
36 #include <type_traits>
37 
38 // some features require at least C++11
39 #include <libint2/cxxstd.h>
40 #if LIBINT2_CPLUSPLUS_STD >= 2011
41 #include <memory>
42 #endif
43 
44 #if HAVE_LAPACK // use F77-type interface for now, switch to LAPACKE later
45 extern "C" void dgesv_(const int* n,
46  const int* nrhs, double* A, const int* lda,
47  int* ipiv, double* b, const int* ldb,
48  int* info);
49 #endif
50 
51 namespace libint2 {
52 
54  template<typename Real>
56  public:
57  ExpensiveNumbers(int ifac = -1, int idf = -1, int ibc = -1) {
58  if (ifac >= 0) {
59  fac.resize(ifac + 1);
60  fac[0] = 1.0;
61  for (int i = 1; i <= ifac; i++) {
62  fac[i] = i * fac[i - 1];
63  }
64  }
65 
66  if (idf >= 0) {
67  df.resize(idf + 1);
68  /* df[i] gives (i-1)!!, so that (-1)!! is defined... */
69  df[0] = 1.0;
70  if (idf >= 1)
71  df[1] = 1.0;
72  if (idf >= 2)
73  df[2] = 1.0;
74  for (int i = 3; i <= idf; i++) {
75  df[i] = (i - 1) * df[i - 2];
76  }
77  }
78 
79  if (ibc >= 0) {
80  bc_.resize((ibc+1)*(ibc+1));
81  std::fill(bc_.begin(), bc_.end(), Real(0));
82  bc.resize(ibc+1);
83  bc[0] = &bc_[0];
84  for(int i=1; i<=ibc; ++i)
85  bc[i] = bc[i-1] + (ibc+1);
86 
87  for(int i=0; i<=ibc; i++)
88  bc[i][0] = 1.0;
89  for(int i=0; i<=ibc; i++)
90  for(int j=1; j<=i; ++j)
91  bc[i][j] = bc[i][j-1] * Real(i-j+1) / Real(j);
92  }
93 
94  for (int i = 0; i < 128; i++) {
95  twoi1[i] = 1.0 / (Real(2.0) * i + Real(1.0));
96  ihalf[i] = Real(i) - Real(0.5);
97  }
98 
99  }
100 
101  ~ExpensiveNumbers() {
102  }
103 
104  std::vector<Real> fac;
105  std::vector<Real> df;
106  std::vector<Real*> bc;
107 
108  // these quantitites are needed with indices <= mmax
109  // 64 is sufficient to handle up to 4 center integrals with up to L=15 basis functions
110  // but need higher values for Yukawa integrals ...
111  Real twoi1[128]; /* 1/(2 i + 1); needed for downward recursion */
112  Real ihalf[128]; /* i - 0.5, needed for upward recursion */
113 
114  private:
115  std::vector<Real> bc_;
116  };
117 
118 #define _local_min_macro(a,b) ((a) > (b) ? (a) : (b))
119 
138  template<typename Real>
140 
142  static Real eval(Real T, size_t m, Real absolute_precision) {
143  assert(m < 100);
144  static const Real T_crit = std::numeric_limits<Real>::is_bounded == true ? -log( std::numeric_limits<Real>::min() * 100.5 / 2. ) : Real(0) ;
145  if (std::numeric_limits<Real>::is_bounded && T > T_crit)
146  throw std::overflow_error("FmEval_Reference<Real>::eval: Real lacks precision for the given value of argument T");
147  Real denom = (m + 0.5);
148  Real term = 0.5 * exp(-T) / denom;
149  Real old_term = 0.0;
150  Real sum = term;
151  //Real rel_error;
152  Real epsilon;
153  const Real relative_zero = std::numeric_limits<Real>::epsilon();
154  const Real absolute_precision_o_1000 = absolute_precision * 0.001;
155  do {
156  denom += 1.0;
157  old_term = term;
158  term = old_term * T / denom;
159  sum += term;
160  //rel_error = term / sum;
161  // stop if adding a term smaller or equal to absolute_precision/1000 and smaller than relative_zero * sum
162  // When sum is small in absolute value, the second threshold is more important
163  epsilon = _local_min_macro(absolute_precision_o_1000, sum*relative_zero);
164  } while (term > epsilon || old_term < term);
165 
166  return sum;
167  }
168 
174  static void eval(Real* Fm, Real T, size_t mmax, Real absolute_precision) {
175 
176  // evaluate for mmax using MacLaurin series
177  // it converges fastest for the largest m -> use it to compute Fmmax(T)
178  // see JPC 94, 5564 (1990).
179  for(size_t m=0; m<=mmax; ++m)
180  Fm[m] = eval(T, m, absolute_precision);
181  return;
190  }
191 
192  };
193 
204  template<typename Real>
206 
212  static void eval(Real* Fm, Real t, size_t mmax, Real absolute_precision) {
213 
214  if (t < Real(30)) {
215  FmEval_Reference<Real>::eval(Fm,t,mmax,absolute_precision);
216  }
217  else {
218  const Real two_over_sqrt_pi{1.128379167095512573896158903121545171688101258657997713688171443421284936882986828973487320404214727};
219  const Real K = 1.0/two_over_sqrt_pi;
220 
221  auto t2 = 2*t;
222  auto et = exp(-t);
223  auto sqrt_t = sqrt(t);
224  Fm[0] = K*erf(sqrt_t)/sqrt_t;
225  if (mmax > 0)
226  for(size_t m=0; m<=mmax-1; m++) {
227  Fm[m+1] = ((2*m + 1)*Fm[m] - et)/(t2);
228  }
229  }
230  }
231 
232  };
233 
238  template <typename Real = double>
240 
241  static const int NGRID = 4096;
242  static const int INTERPOLATION_ORDER = 4;
243  static const bool INTERPOLATION_AND_RECURSION = false;
244 
245  const Real T_crit;
246  const Real delta;
247  const Real one_over_delta;
248  int mmax;
249  ExpensiveNumbers<double> numbers_;
250  Real *c; /* the Chebyshev coefficients table, NGRID by mmax*interpolation_order */
251 
252  public:
255  FmEval_Chebyshev3(int m_max, double = 0.0) :
256  T_crit(30.0), // this translates in appr. 1e-15 error in upward recursion, see the note below
257  delta(T_crit / (NGRID - 1)),
258  one_over_delta(1.0 / delta),
259  mmax(m_max), numbers_(14) {
260  assert(mmax <= 63);
261  if (m_max >= 0)
262  init();
263  }
264  ~FmEval_Chebyshev3() {
265  if (mmax >= 0) {
266  free(c);
267  }
268  }
269 
270  // some features require at least C++11
271 #if LIBINT2_CPLUSPLUS_STD >= 2011
272  static const std::shared_ptr<FmEval_Chebyshev3>& instance(int m_max, double = 0.0) {
274 
275  // thread-safe per C++11 standard [6.7.4]
276  static auto instance_ = std::shared_ptr<FmEval_Chebyshev3>{};
277 
278  const bool need_new_instance = !instance_ || (instance_ && instance_->max_m() < m_max);
279  if (need_new_instance) {
280  auto new_instance = std::make_shared<FmEval_Chebyshev3>(m_max);
281  instance_ = new_instance; // thread-safe
282  }
283 
284  return instance_;
285  }
286 #endif
287 
289  int max_m() const { return mmax; }
290 
295  inline void eval(Real* Fm, Real x, int m_max) const {
296 
297  // large T => use upward recursion
298  // cost = 1 div + 1 sqrt + (1 + 2*(m-1)) muls
299  if (x > T_crit) {
300  const double one_over_x = 1.0/x;
301  Fm[0] = 0.88622692545275801365 * sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen
302  if (m_max == 0)
303  return;
304  // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit is <1e-15
305  for (int i = 1; i <= m_max; i++)
306  Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13)
307  return;
308  }
309 
310  // ---------------------------------------------
311  // small and intermediate arguments => interpolate Fm and (optional) downward recursion
312  // ---------------------------------------------
313  // about which point on the grid to interpolate?
314  const double xd = x * one_over_delta;
315  const int iv = int(xd); // the interval
316  // INTERPOLATION_AND_RECURSION== true? evaluate by interpolation for LARGEST m only
317  // INTERPOLATION_AND_RECURSION==false? evaluate by interpolation for ALL m
318  const int m_min = INTERPOLATION_AND_RECURSION ? m_max : 0;
319 
320 #if defined(__AVX__) || defined(__SSE2__)
321  const auto x2 = xd*xd;
322  const auto x3 = x2*xd;
323 # if defined (__AVX__)
324  libint2::simd::VectorAVXDouble xvec(1., xd, x2, x3);
325 # else // defined(__SSE2__)
326  libint2::simd::VectorSSEDouble x0vec(1., xd);
327  libint2::simd::VectorSSEDouble x1vec(x2, x3);
328 # endif
329 #endif // SSE2 || AVX
330 
331  const Real *d = c + INTERPOLATION_ORDER * (iv * (mmax+1) + m_min); // ptr to the interpolation data for m=mmin
332  int m = m_min;
333 #if defined(__AVX__)
334  if (m_max-m >=3) {
335  const int unroll_size = 4;
336  const int m_fence = (m_max + 2 - unroll_size);
337  for(; m<m_fence; m+=unroll_size, d+=INTERPOLATION_ORDER*unroll_size) {
338  libint2::simd::VectorAVXDouble d0v, d1v, d2v, d3v;
339  d0v.load_aligned(d);
340  d1v.load_aligned(d+INTERPOLATION_ORDER);
341  d2v.load_aligned(d+2*INTERPOLATION_ORDER);
342  d3v.load_aligned(d+3*INTERPOLATION_ORDER);
343  libint2::simd::VectorAVXDouble fm0 = d0v * xvec;
344  libint2::simd::VectorAVXDouble fm1 = d1v * xvec;
345  libint2::simd::VectorAVXDouble fm2 = d2v * xvec;
346  libint2::simd::VectorAVXDouble fm3 = d3v * xvec;
347  libint2::simd::VectorAVXDouble sum0123 = horizontal_add(fm0, fm1, fm2, fm3);
348  sum0123.convert(&Fm[m]);
349  }
350  } // unroll_size=4
351  if (m_max-m >=1) {
352  const int unroll_size = 2;
353  const int m_fence = (m_max + 2 - unroll_size);
354  for(; m<m_fence; m+=unroll_size, d+=INTERPOLATION_ORDER*unroll_size) {
356  d0v.load_aligned(d);
357  d1v.load_aligned(d+INTERPOLATION_ORDER);
358  libint2::simd::VectorAVXDouble fm0 = d0v * xvec;
359  libint2::simd::VectorAVXDouble fm1 = d1v * xvec;
360  libint2::simd::VectorSSEDouble sum01 = horizontal_add(fm0, fm1);
361  sum01.convert(&Fm[m]);
362  }
363  } // unroll_size=2
364  { // no unrolling
365  for(; m<=m_max; ++m, d+=INTERPOLATION_ORDER) {
367  dvec.load_aligned(d);
368  Fm[m] = horizontal_add(dvec * xvec);
369  }
370  }
371 #elif defined(__SSE2__)
372  if (m_max-m >=1) {
373  const int unroll_size = 2;
374  const int m_fence = (m_max + 2 - unroll_size);
375  for(; m<m_fence; m+=unroll_size, d+=INTERPOLATION_ORDER*unroll_size) {
376  libint2::simd::VectorSSEDouble d00v, d01v, d10v, d11v;
377  d00v.load_aligned(d);
378  d01v.load_aligned(d+2);
379  d10v.load_aligned(d+4); // d + INTERPOLATION_ORDER
380  d11v.load_aligned(d+6);
381  libint2::simd::VectorSSEDouble fm00 = d00v * x0vec;
382  libint2::simd::VectorSSEDouble fm01 = d01v * x1vec;
383  libint2::simd::VectorSSEDouble fm10 = d10v * x0vec;
384  libint2::simd::VectorSSEDouble fm11 = d11v * x1vec;
385  libint2::simd::VectorSSEDouble sum01 = horizontal_add(fm00, fm10) + horizontal_add(fm01, fm11);
386  sum01.convert(&Fm[m]);
387  }
388  } // unroll_size=2
389  { // no unrolling
390  for(; m<=m_max; ++m, d+=INTERPOLATION_ORDER) {
391  libint2::simd::VectorSSEDouble d0vec, d1vec;
392  d0vec.load_aligned(d);
393  d1vec.load_aligned(d+2);
394  Fm[m] = horizontal_add(d0vec * x0vec + d1vec * x1vec);
395  }
396  }
397 #else // not SSE2 nor AVX available
398  for(int m=m_min; m<=m_max; ++m, d+=INTERPOLATION_ORDER) {
399  Fm[m] = d[0]
400  + xd * (d[1] + xd * (d[2] + xd * d[3]));
401 
402  // // check against the reference value
403  // if (false) {
404  // double refvalue = FmEval_Reference2<double>::eval(x, m, 1e-15); // compute F(T)
405  // if (abs(refvalue - Fm[m]) > 1e-10) {
406  // std::cout << "T = " << x << " m = " << m << " cheb = "
407  // << Fm[m] << " ref = " << refvalue << std::endl;
408  // }
409  // }
410  }
411 #endif
412 
413 
414  // use downward recursion (Eq. (9.8.14) in HJO)
415  // WARNING: do not turn this on ... on modern CPU exp is very slow
416  if (INTERPOLATION_AND_RECURSION && m_max > 0) {
417  const bool INTERPOLATION_AND_RECURSION_is_slow_on_modern_CPU = false;
418  assert(INTERPOLATION_AND_RECURSION_is_slow_on_modern_CPU);
419  const Real x2 = 2.0 * x;
420  const Real exp_x = exp(-x);
421  for (int m = m_max - 1; m >= 0; m--)
422  Fm[m] = (Fm[m + 1] * x2 + exp_x) * numbers_.twoi1[m];
423  }
424  }
425 
426  private:
427 
428  /* ----------------------------------------------------------------------------
429  This function here creates the expansion coefficients for a single interval
430 
431  ON INPUT a,b : the interval boundaries
432  cc : a pointer to the appropriate place in
433  the coefficient table
434  m : the F[m] to generate
435  ON OUTPUT cc : cc[0]-cc[3] hold the coefficients
436  ---------------------------------------------------------------------------- */
437  void MakeCoeffs(double a, double b, Real *cc, int m) {
438  int k, j;
439  double f[128], ac[128], Fm[128];
440  double sum;
441  // characterize the interval
442  double TwoDelta = b - a;
443  double Delta = 0.5 * TwoDelta;
444  double HalfDelta = 0.5 * Delta;
445  double XXX = a + Delta;
446 
447  const double absolute_precision = 1e-100; // compute as precisely as possible
448  FmEval_Reference2<double>::eval(Fm, XXX, m + INTERPOLATION_ORDER + 20,
449  absolute_precision);
450 
451  for (k = 0; k <= INTERPOLATION_ORDER + 20; k++) {
452  if ((k % 2) == 0)
453  f[k] = Fm[k + m];
454  else
455  f[k] = -Fm[k + m];
456  }
457  // calculate the coefficients a
458  double fac;
459  for (j = 0; j < INTERPOLATION_ORDER; j++) {
460  if (j == 0)
461  fac = 1.0;
462  else
463  fac = 2.0 * pow(HalfDelta, (double) j);
464  sum = 0.0;
465  for (k = 0; k < 10; k++)
466  sum += f[j + 2 * k] * pow(HalfDelta, (double) (2 * k)) / numbers_.fac[k]
467  / numbers_.fac[k + j];
468  ac[j] = fac * sum;
469  }
470  // calculate the coefficients c that are Gill's f's
471  double arg = -XXX / Delta;
472  double arg2 = arg * arg;
473  double arg3 = arg2 * arg;
474  auto cc0 = (ac[0] - ac[2]) + (ac[1] - 3.0 * ac[3]) * arg
475  + 2.0 * ac[2] * arg2 + 4.0 * ac[3] * arg3;
476 
477  auto cc1 = (2.0 * ac[1] - 6.0 * ac[3]) + 8.0 * ac[2] * arg
478  + 24.0 * ac[3] * arg2;
479  auto cc2 = 8.0 * ac[2] + 48.0 * ac[3] * arg;
480  auto cc3 = 32.0 * ac[3];
481  cc[0] = cc0;
482  cc[1] = cc1;
483  cc[2] = cc2;
484  cc[3] = cc3;
485  }
486 
487  /* ----------------------------------------------------------------------------
488  This function makes the expansion coefficients for all intervals
489 
490 
491  ON INPUT m : the highest F[m] to generate
492 
493  ON OUTPUT c : the coefficients c[m][i] are generated
494  ---------------------------------------------------------------------------- */
495  void init() {
496  int iv, im;
497 
498  // get memory
499  void* result;
500  if (posix_memalign(&result,
501  4*sizeof(Real),
502  (mmax + 1) * NGRID * INTERPOLATION_ORDER * sizeof(Real)))
503  throw std::bad_alloc();
504  c = static_cast<Real*>(result);
505 
506  // make expansion coefficients for each grid value of T
507  for (iv = 0; iv < NGRID; iv++) {
508  const auto a = iv * delta;
509  const auto b = a + delta;
510 
511  // loop over all m values and make the coefficients
512  for (im = 0; im <= mmax; im++) {
513  MakeCoeffs(a, b, c + (iv * (mmax+1) + im) * INTERPOLATION_ORDER, im);
514  }
515  }
516  }
517 
518  };
519 
523  template <typename Real = double>
525 
526  static const int N = 60;
527  static const int ORDER = 7;
528  static const int ORDERp1 = ORDER+1;
529 
530  const Real T_crit;
531  const Real delta;
532  const Real one_over_delta;
533  int mmax;
534  ExpensiveNumbers<double> numbers_;
535  Real *c; /* the Chebyshev coefficients table, N by mmax*interpolation_order */
536 
537  public:
540  FmEval_Chebyshev7(int m_max, double = 0.0) :
541  T_crit(30.0), // this translates in appr. 1e-15 error in upward recursion, see the note below
542  delta(T_crit / N),
543  one_over_delta(1.0 / delta),
544  mmax(m_max), numbers_(14) {
545  assert(mmax <= 63);
546  if (m_max >= 0)
547  init();
548  }
549  ~FmEval_Chebyshev7() {
550  if (mmax >= 0) {
551  free(c);
552  }
553  }
554 
555  // some features require at least C++11
556 #if LIBINT2_CPLUSPLUS_STD >= 2011
557  static const std::shared_ptr<FmEval_Chebyshev7>& instance(int m_max, double = 0.0) {
559 
560  // thread-safe per C++11 standard [6.7.4]
561  static auto instance_ = std::shared_ptr<FmEval_Chebyshev7>{};
562 
563  const bool need_new_instance = !instance_ || (instance_ && instance_->max_m() < m_max);
564  if (need_new_instance) {
565  auto new_instance = std::make_shared<FmEval_Chebyshev7>(m_max);
566  instance_ = new_instance; // thread-safe
567  }
568 
569  return instance_;
570  }
571 #endif
572 
574  int max_m() const { return mmax; }
575 
580  inline void eval(Real* Fm, Real x, int m_max) const {
581 
582  // large T => use upward recursion
583  // cost = 1 div + 1 sqrt + (1 + 2*(m-1)) muls
584  if (x > T_crit) {
585  const double one_over_x = 1.0/x;
586  Fm[0] = 0.88622692545275801365 * sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen
587  if (m_max == 0)
588  return;
589  // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit is <1e-15
590  for (int i = 1; i <= m_max; i++)
591  Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13)
592  return;
593  }
594 
595  // ---------------------------------------------
596  // small and intermediate arguments => interpolate Fm and (optional) downward recursion
597  // ---------------------------------------------
598  // which interval does this x fall into?
599  const Real x_over_delta = x * one_over_delta;
600  const int iv = int(x_over_delta); // the interval index
601  const Real xd = x_over_delta - (Real)iv - 0.5; // this ranges from -0.5 to 0.5
602  const int m_min = 0;
603 
604 #if defined(__AVX__) || defined(__SSE2__)
605  const auto x2 = xd*xd;
606  const auto x3 = x2*xd;
607  const auto x4 = x2*x2;
608  const auto x5 = x2*x3;
609  const auto x6 = x3*x3;
610  const auto x7 = x3*x4;
611 # if defined (__AVX__)
612  libint2::simd::VectorAVXDouble x0vec(1., xd, x2, x3);
613  libint2::simd::VectorAVXDouble x1vec(x4, x5, x6, x7);
614 # else // defined(__SSE2__)
615  libint2::simd::VectorSSEDouble x0vec(1., xd);
616  libint2::simd::VectorSSEDouble x1vec(x2, x3);
617  libint2::simd::VectorSSEDouble x2vec(x4, x5);
618  libint2::simd::VectorSSEDouble x3vec(x6, x7);
619 # endif
620 #endif // SSE2 || AVX
621 
622  const Real *d = c + (ORDERp1) * (iv * (mmax+1) + m_min); // ptr to the interpolation data for m=mmin
623  int m = m_min;
624 #if defined(__AVX__)
625  if (m_max-m >=3) {
626  const int unroll_size = 4;
627  const int m_fence = (m_max + 2 - unroll_size);
628  for(; m<m_fence; m+=unroll_size, d+=ORDERp1*unroll_size) {
629  libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v,
630  d20v, d21v, d30v, d31v;
631  d00v.load_aligned(d); d01v.load_aligned((d+4));
632  d10v.load_aligned(d+ORDERp1); d11v.load_aligned((d+4)+ORDERp1);
633  d20v.load_aligned(d+2*ORDERp1); d21v.load_aligned((d+4)+2*ORDERp1);
634  d30v.load_aligned(d+3*ORDERp1); d31v.load_aligned((d+4)+3*ORDERp1);
635  libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec;
636  libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec;
637  libint2::simd::VectorAVXDouble fm2 = d20v * x0vec + d21v * x1vec;
638  libint2::simd::VectorAVXDouble fm3 = d30v * x0vec + d31v * x1vec;
639  libint2::simd::VectorAVXDouble sum0123 = horizontal_add(fm0, fm1, fm2, fm3);
640  sum0123.convert(&Fm[m]);
641  }
642  } // unroll_size=4
643  if (m_max-m >=1) {
644  const int unroll_size = 2;
645  const int m_fence = (m_max + 2 - unroll_size);
646  for(; m<m_fence; m+=unroll_size, d+=ORDERp1*unroll_size) {
647  libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v;
648  d00v.load_aligned(d);
649  d01v.load_aligned((d+4));
650  d10v.load_aligned(d+ORDERp1);
651  d11v.load_aligned((d+4)+ORDERp1);
652  libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec;
653  libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec;
654  libint2::simd::VectorSSEDouble sum01 = horizontal_add(fm0, fm1);
655  sum01.convert(&Fm[m]);
656  }
657  } // unroll_size=2
658  { // no unrolling
659  for(; m<=m_max; ++m, d+=ORDERp1) {
661  d0v.load_aligned(d);
662  d1v.load_aligned(d+4);
663  Fm[m] = horizontal_add(d0v * x0vec + d1v * x1vec);
664  }
665  }
666 #elif defined(__SSE2__) && 0 // no SSE yet
667  if (m_max-m >=1) {
668  const int unroll_size = 2;
669  const int m_fence = (m_max + 2 - unroll_size);
670  for(; m<m_fence; m+=unroll_size, d+=ORDERp1*unroll_size) {
671  libint2::simd::VectorSSEDouble d00v, d01v, d10v, d11v;
672  d00v.load_aligned(d);
673  d01v.load_aligned(d+2);
674  d10v.load_aligned(d+4); // d + ORDERp1
675  d11v.load_aligned(d+6);
676  libint2::simd::VectorSSEDouble fm00 = d00v * x0vec;
677  libint2::simd::VectorSSEDouble fm01 = d01v * x1vec;
678  libint2::simd::VectorSSEDouble fm10 = d10v * x0vec;
679  libint2::simd::VectorSSEDouble fm11 = d11v * x1vec;
680  libint2::simd::VectorSSEDouble sum01 = horizontal_add(fm00, fm10) + horizontal_add(fm01, fm11);
681  sum01.convert(&Fm[m]);
682  }
683  } // unroll_size=2
684  { // no unrolling
685  for(; m<=m_max; ++m, d+=ORDERp1) {
686  libint2::simd::VectorSSEDouble d0vec, d1vec;
687  d0vec.load_aligned(d);
688  d1vec.load_aligned(d+2);
689  Fm[m] = horizontal_add(d0vec * x0vec + d1vec * x1vec);
690  }
691  }
692 #else // not SSE2 nor AVX available
693  for(int m=m_min; m<=m_max; ++m, d+=ORDERp1) {
694  Fm[m] = d[0]
695  + xd * (d[1]
696  + xd * (d[2]
697  + xd * (d[3]
698  + xd * (d[4]
699  + xd * (d[5]
700  + xd * (d[6]
701  + xd * (d[7])))))));
702 
703  // // check against the reference value
704  // if (false) {
705  // double refvalue = FmEval_Reference2<double>::eval(x, m, 1e-15); // compute F(T)
706  // if (abs(refvalue - Fm[m]) > 1e-10) {
707  // std::cout << "T = " << x << " m = " << m << " cheb = "
708  // << Fm[m] << " ref = " << refvalue << std::endl;
709  // }
710  // }
711  }
712 #endif
713 
714 
715  } // eval()
716 
717  private:
718 
719  void init() {
720 
721 #include <libint2/boys_cheb7.h>
722 
723  assert(mmax <= cheb_table_mmax);
724  // get memory
725  void* result;
726  posix_memalign(&result, ORDERp1*sizeof(Real), (mmax + 1) * N * ORDERp1 * sizeof(Real));
727  c = static_cast<Real*>(result);
728 
729  // copy contents of static table into c
730  // need all intervals
731  for (int iv = 0; iv < N; ++iv) {
732  // but only values of m up to mmax
733  std::copy(cheb_table[iv], cheb_table[iv]+(mmax+1)*ORDERp1, c+(iv*(mmax+1))*ORDERp1);
734  }
735  }
736 
737  }; // FmEval_Chebyshev7
738 
739 #ifndef STATIC_OON
740 #define STATIC_OON
741  namespace {
742  const double oon[] = {0.0, 1.0, 1.0/2.0, 1.0/3.0, 1.0/4.0, 1.0/5.0, 1.0/6.0, 1.0/7.0, 1.0/8.0, 1.0/9.0, 1.0/10.0, 1.0/11.0};
743  }
744 #endif
745 
751  template<typename Real = double, int INTERPOLATION_ORDER = 7>
753  public:
754  static const int max_interp_order = 8;
755  static const bool INTERPOLATION_AND_RECURSION = false; // compute F_lmax(T) and then iterate down to F_0(T)? Else use interpolation only
756  const double soft_zero_;
757 
759  FmEval_Taylor(unsigned int mmax, Real precision) :
760  soft_zero_(1e-6), cutoff_(precision), numbers_(
761  INTERPOLATION_ORDER + 1, 2 * (mmax + INTERPOLATION_ORDER - 1)) {
762 
763  assert(mmax <= 63);
764 
765  const double sqrt_pi = std::sqrt(M_PI);
766 
767  /*---------------------------------------
768  We are doing Taylor interpolation with
769  n=TAYLOR_ORDER terms here:
770  error <= delT^n/(n+1)!
771  ---------------------------------------*/
772  delT_ = 2.0
773  * std::pow(cutoff_ * numbers_.fac[INTERPOLATION_ORDER + 1],
774  1.0 / INTERPOLATION_ORDER);
775  oodelT_ = 1.0 / delT_;
776  max_m_ = mmax + INTERPOLATION_ORDER - 1;
777 
778  T_crit_ = new Real[max_m_ + 1]; /*--- m=0 is included! ---*/
779  max_T_ = 0;
780  /*--- Figure out T_crit for each m and put into the T_crit ---*/
781  for (int m = max_m_; m >= 0; --m) {
782  /*------------------------------------------
783  Damped Newton-Raphson method to solve
784  T^{m-0.5}*exp(-T) = epsilon*Gamma(m+0.5)
785  The solution is the max T for which to do
786  the interpolation
787  ------------------------------------------*/
788  double T = -log(cutoff_);
789  const double egamma = cutoff_ * sqrt_pi * numbers_.df[2 * m]
790  / std::pow(2.0, m);
791  double T_new = T;
792  double func;
793  do {
794  const double damping_factor = 0.2;
795  T = T_new;
796  /* f(T) = the difference between LHS and RHS of the equation above */
797  func = std::pow(T, m - 0.5) * std::exp(-T) - egamma;
798  const double dfuncdT = ((m - 0.5) * std::pow(T, m - 1.5)
799  - std::pow(T, m - 0.5)) * std::exp(-T);
800  /* f(T) has 2 roots and has a maximum in between. If f'(T) > 0 we are to the left of the hump. Make a big step to the right. */
801  if (dfuncdT > 0.0) {
802  T_new *= 2.0;
803  } else {
804  /* damp the step */
805  double deltaT = -func / dfuncdT;
806  const double sign_deltaT = (deltaT > 0.0) ? 1.0 : -1.0;
807  const double max_deltaT = damping_factor * T;
808  if (std::fabs(deltaT) > max_deltaT)
809  deltaT = sign_deltaT * max_deltaT;
810  T_new = T + deltaT;
811  }
812  if (T_new <= 0.0) {
813  T_new = T / 2.0;
814  }
815  } while (std::fabs(func / egamma) >= soft_zero_);
816  T_crit_[m] = T_new;
817  const int T_idx = (int) std::floor(T_new / delT_);
818  max_T_ = std::max(max_T_, T_idx);
819  }
820 
821  // allocate the grid (see the comments below)
822  {
823  const int nrow = max_T_ + 1;
824  const int ncol = max_m_ + 1;
825  grid_ = new Real*[nrow];
826  grid_[0] = new Real[nrow * ncol];
827  //std::cout << "Allocated interpolation table of " << nrow * ncol << " reals" << std::endl;
828  for (int r = 1; r < nrow; ++r)
829  grid_[r] = grid_[r - 1] + ncol;
830  }
831 
832  /*-------------------------------------------------------
833  Tabulate the gamma function from t=delT to T_crit[m]:
834  1) include T=0 though the table is empty for T=0 since
835  Fm(0) is simple to compute
836  -------------------------------------------------------*/
837  /*--- do the mmax first ---*/
838  for (int T_idx = max_T_; T_idx >= 0; --T_idx) {
839  const double T = T_idx * delT_;
840  libint2::FmEval_Reference2<double>::eval(grid_[T_idx], T, max_m_, 1e-100);
841  }
842  }
843 
844  ~FmEval_Taylor() {
845  delete[] T_crit_;
846  delete[] grid_[0];
847  delete[] grid_;
848  }
849 
850  // some features require at least C++11
851 #if LIBINT2_CPLUSPLUS_STD >= 2011
852  static const std::shared_ptr<FmEval_Taylor>& instance(unsigned int mmax, Real precision) {
855 
856  // thread-safe per C++11 standard [6.7.4]
857  static auto instance_ = std::shared_ptr<FmEval_Taylor>{};
858 
859  const bool need_new_instance = !instance_ ||
860  (instance_ && (instance_->max_m() < mmax ||
861  instance_->precision() > precision));
862  if (need_new_instance) {
863  auto new_instance = std::make_shared<FmEval_Taylor>(mmax, precision);
864  instance_ = new_instance; // thread-safe
865  }
866 
867  return instance_;
868  }
869 #endif
870 
872  int max_m() const { return max_m_ - INTERPOLATION_ORDER + 1; }
874  Real precision() const { return cutoff_; }
875 
881  void eval(Real* Fm, Real T, int mmax) const {
882  const double sqrt_pio2 = 1.2533141373155002512;
883  const double two_T = 2.0 * T;
884 
885  // stop recursion at mmin
886  const int mmin = INTERPOLATION_AND_RECURSION ? mmax : 0;
887  /*-------------------------------------
888  Compute Fm(T) from mmax down to mmin
889  -------------------------------------*/
890  const bool use_upward_recursion = true;
891  if (use_upward_recursion) {
892 // if (T > 30.0) {
893  if (T > T_crit_[0]) {
894  const double one_over_x = 1.0/T;
895  Fm[0] = 0.88622692545275801365 * sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen
896  if (mmax == 0)
897  return;
898  // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit is <1e-15
899  for (int i = 1; i <= mmax; i++)
900  Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13)
901  return;
902  }
903  }
904 
905  // since Tcrit grows with mmax, this condition only needs to be determined once
906  if (T > T_crit_[mmax]) {
907  double pow_two_T_to_minusjp05 = std::pow(two_T, -mmax - 0.5);
908  for (int m = mmax; m >= mmin; --m) {
909  /*--- Asymptotic formula ---*/
910  Fm[m] = numbers_.df[2 * m] * sqrt_pio2 * pow_two_T_to_minusjp05;
911  pow_two_T_to_minusjp05 *= two_T;
912  }
913  }
914  else
915  {
916  const int T_ind = (int) (0.5 + T * oodelT_);
917  const Real h = T_ind * delT_ - T;
918  const Real* F_row = grid_[T_ind] + mmin;
919 
920 #if defined (__AVX__)
921  libint2::simd::VectorAVXDouble h0123, h4567;
922  if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
923  const double h2 = h*h*oon[2];
924  const double h3 = h2*h*oon[3];
925  h0123 = libint2::simd::VectorAVXDouble (1.0, h, h2, h3);
926  if (INTERPOLATION_ORDER == 7) {
927  const double h4 = h3*h*oon[4];
928  const double h5 = h4*h*oon[5];
929  const double h6 = h5*h*oon[6];
930  const double h7 = h6*h*oon[7];
931  h4567 = libint2::simd::VectorAVXDouble (h4, h5, h6, h7);
932  }
933  }
934  // libint2::simd::VectorAVXDouble h0123(1.0);
935  // libint2::simd::VectorAVXDouble h4567(1.0);
936 #endif
937 
938  int m = mmin;
939  if (mmax-m >=1) {
940  const int unroll_size = 2;
941  const int m_fence = (mmax + 2 - unroll_size);
942  for(; m<m_fence; m+=unroll_size, F_row+=unroll_size) {
943 
944 #if defined(__AVX__)
945  if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
946  libint2::simd::VectorAVXDouble fr0_0123; fr0_0123.load(F_row);
947  libint2::simd::VectorAVXDouble fr1_0123; fr1_0123.load(F_row+1);
948  libint2::simd::VectorSSEDouble fm01 = horizontal_add(fr0_0123*h0123, fr1_0123*h0123);
949  if (INTERPOLATION_ORDER == 7) {
950  libint2::simd::VectorAVXDouble fr0_4567; fr0_4567.load(F_row+4);
951  libint2::simd::VectorAVXDouble fr1_4567; fr1_4567.load(F_row+5);
952  fm01 += horizontal_add(fr0_4567*h4567, fr1_4567*h4567);
953  }
954  fm01.convert(&Fm[m]);
955  }
956  else {
957 #endif
958  Real total0 = 0.0, total1 = 0.0;
959  for(int i=INTERPOLATION_ORDER; i>=1; --i) {
960  total0 = oon[i]*h*(F_row[i] + total0);
961  total1 = oon[i]*h*(F_row[i+1] + total1);
962  }
963  Fm[m] = F_row[0] + total0;
964  Fm[m+1] = F_row[1] + total1;
965 #if defined(__AVX__)
966  }
967 #endif
968  }
969  } // unroll_size = 2
970  if (m<=mmax) { // unroll_size = 1
971 #if defined(__AVX__)
972  if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
973  libint2::simd::VectorAVXDouble fr0123; fr0123.load(F_row);
974  if (INTERPOLATION_ORDER == 7) {
975  libint2::simd::VectorAVXDouble fr4567; fr4567.load(F_row+4);
976 // libint2::simd::VectorSSEDouble fm = horizontal_add(fr0123*h0123, fr4567*h4567);
977 // Fm[m] = horizontal_add(fm);
978  Fm[m] = horizontal_add(fr0123*h0123 + fr4567*h4567);
979  }
980  else { // INTERPOLATION_ORDER == 3
981  Fm[m] = horizontal_add(fr0123*h0123);
982  }
983  }
984  else {
985 #endif
986  Real total = 0.0;
987  for(int i=INTERPOLATION_ORDER; i>=1; --i) {
988  total = oon[i]*h*(F_row[i] + total);
989  }
990  Fm[m] = F_row[0] + total;
991 #if defined(__AVX__)
992  }
993 #endif
994  } // unroll_size = 1
995 
996  // check against the reference value
997 // if (false) {
998 // double refvalue = FmEval_Reference2<double>::eval(T, mmax, 1e-15); // compute F(T) with m=mmax
999 // if (abs(refvalue - Fm[mmax]) > 1e-14) {
1000 // std::cout << "T = " << T << " m = " << mmax << " cheb = "
1001 // << Fm[mmax] << " ref = " << refvalue << std::endl;
1002 // }
1003 // }
1004 
1005  } // if T < T_crit
1006 
1007  /*------------------------------------
1008  And then do downward recursion in j
1009  ------------------------------------*/
1010  if (INTERPOLATION_AND_RECURSION && mmin > 0) {
1011  const Real exp_mT = std::exp(-T);
1012  for (int m = mmin - 1; m >= 0; --m) {
1013  Fm[m] = (exp_mT + two_T * Fm[m+1]) * numbers_.twoi1[m];
1014  }
1015  }
1016  }
1017 
1018  private:
1019  Real **grid_; /* Table of "exact" Fm(T) values. Row index corresponds to
1020  values of T (max_T+1 rows), column index to values
1021  of m (max_m+1 columns) */
1022  Real delT_; /* The step size for T, depends on cutoff */
1023  Real oodelT_; /* 1.0 / delT_, see above */
1024  Real cutoff_; /* Tolerance cutoff used in all computations of Fm(T) */
1025  int max_m_; /* Maximum value of m in the table, depends on cutoff
1026  and the number of terms in Taylor interpolation */
1027  int max_T_; /* Maximum index of T in the table, depends on cutoff
1028  and m */
1029  Real *T_crit_; /* Maximum T for each row, depends on cutoff;
1030  for a given m and T_idx <= max_T_idx[m] use Taylor interpolation,
1031  for a given m and T_idx > max_T_idx[m] use the asymptotic formula */
1032 
1033  ExpensiveNumbers<double> numbers_;
1034 
1043  static double truncation_error(unsigned int m, double T) {
1044  const double m2= m * m;
1045  const double m3= m2 * m;
1046  const double m4= m2 * m2;
1047  const double T2= T * T;
1048  const double T3= T2 * T;
1049  const double T4= T2 * T2;
1050  const double T5= T2 * T3;
1051 
1052  const double result = exp(-T) * (105 + 16*m4 + 16*m3*(T - 8) - 30*T + 12*T2
1053  - 8*T3 + 16*T4 + 8*m2*(43 - 9*T + 2*T2) +
1054  4*m*(-88 + 23*T - 8*T2 + 4*T3))/(32*T5);
1055  return result;
1056  }
1065  static double truncation_error(double T) {
1066  const double result = exp(-T) /(2*T);
1067  return result;
1068  }
1069  };
1070 
1071 
1075 
1076 #if 0
1077 
1081  template<typename Real>
1082  struct YukawaGmEval {
1083 
1084  static const int mmin = -1;
1085 
1087  YukawaGmEval(unsigned int mmax, Real precision) :
1088  mmax_(mmax), precision_(precision),
1089  numbers_(),
1090  Gm_0_U_(256) // should be enough to hold up to G_{255}(0,U)
1091  { }
1092 
1093  unsigned int max_m() const { return mmax; }
1095  Real precision() const { return precision_; }
1096 
1098  void eval_yukawa(Real* Gm, Real T, Real U, size_t mmax, Real absolute_precision) {
1099  assert(false); // not yet implemented
1100  }
1102  void eval_slater(Real* Gm, Real T, Real U, size_t mmax, Real absolute_precision) {
1103  assert(false); // not yet implemented
1104  }
1105 
1109  static void eval_yukawa_s1(Real* Gm, Real T, Real U, size_t mmax) {
1110  Real G_m1;
1111 
1112  const Real sqrt_U = sqrt(U);
1113  const Real sqrt_T = sqrt(T);
1114  const Real oo_sqrt_T = 1 / sqrt_T;
1115  const Real oo_sqrt_U = 1 / sqrt_U;
1116  const Real exp_mT = exp(-T);
1117  const Real kappa = sqrt_U - sqrt_T;
1118  const Real lambda = sqrt_U + sqrt_T;
1119  const Real sqrtPi_over_4(0.44311346272637900682454187083528629569938736403060);
1120  const Real pfac = sqrtPi_over_4 * exp_mT;
1121  const Real erfc_k = exp(kappa*kappa) * (1 - erf(kappa));
1122  const Real erfc_l = exp(lambda*lambda) * (1 - erf(lambda));
1123 
1124  Gm[0] = pfac * (erfc_k + erfc_l) * oo_sqrt_U;
1125  Gm[1] = pfac * (erfc_k - erfc_l) * oo_sqrt_T;
1126  if (mmax > 0) {
1127 
1128  // first application of URR
1129  const Real oo_two_T = 0.5 / T;
1130  const Real two_U = 2.0 * U;
1131 
1132  for(unsigned int m=1, two_m_minus_1=1; m<=mmax; ++m, two_m_minus_1+=2) {
1133  Gm[m+1] = oo_two_T * ( two_m_minus_1 * Gm[m] + two_U * Gm[m-1] - exp_mT);
1134  }
1135  }
1136 
1137  return;
1138  }
1139 
1146  void eval_yukawa_s2(Real* Gm, Real T, Real U, size_t mmax) {
1147 
1148  // TODO estimate the number of expansion terms for the given precision
1149  const int expansion_order = 60;
1150  eval_yukawa_Gm0U(Gm_0_U_, U, mmax - 1 + expansion_order);
1151 
1152  // Maclaurin
1153 
1154 
1155  // downward recursion
1156  //Gm[m + 1] = 1/(2 U) (E^-T - (2 m + 3) Gm[[m + 2]] + 2 T Gm[[m + 3]])
1157  const Real one_over_twoU = 0.5 / U;
1158  const Real one_over_twoU = 2.0 * T;
1159  const Real exp_mT = exp(-T);
1160  for(int m=mmax-2; m>=-1; --m)
1161  Gm[m] = one_over_twoU (exp_mT - numbers_.twoi1[m+1] * Gm[m+1] + twoT Gm[m+2])
1162 
1163  // testing ...
1164  std::copy(Gm_0_U_.begin()+1, Gm_0_U_.begin()+mmax+2, Gm);
1165 
1166  return;
1167  }
1168 
1174  void eval_yukawa_s3(Real* Gm, Real T, Real U, size_t mmax) {
1175 
1176  // Ten-no's prescription:
1177  //
1178 
1179  assert(false);
1180 
1181  // testing ...
1182  std::copy(Gm_0_U_.begin()+1, Gm_0_U_.begin()+mmax+2, Gm);
1183 
1184  return;
1185  }
1186 
1187 
1197  void eval_yukawa_Gm0U(Real* Gm0U, Real U, int mmax, int mmin = -1) {
1198 
1199  // Ten-no's prescription:
1200  // start with Gm*(0,T)
1201  // 1) for U < 5, m* = -1
1202  // 2) for U > 5, m* = min(U,mmax)
1203  int mstar;
1204 
1205  // G_{-1} (0,U) is easy
1206  if (U < 5.0) {
1207  mstar = -1;
1208 
1209  const Real sqrt_U = sqrt(U);
1210  const Real exp_U = exp(U);
1211  const Real oo_sqrt_U = 1 / sqrt_U;
1212  const Real sqrtPi_over_2(
1213  0.88622692545275801364908374167057259139877472806119);
1214  const Real pfac = sqrtPi_over_2 * exp_U;
1215  const Real erfc_sqrt_U = 1.0 - erf(sqrt_U);
1216  Gm_0_U_[0] = pfac * exp_U * oo_sqrt_U * erfc_sqrt_U;
1217  // can get G0 for "free"
1218  // this is the l'Hopital-transformed expression for G_0 (0,T)
1219 // const Real sqrtPi(
1220 // 1.7724538509055160272981674833411451827975494561224);
1221 // Gm_0_U_[1] = 1.0 - exp_U * sqrtPi * sqrt_U * erfc_sqrt_U;
1222  }
1223  else { // use continued fraction for m*
1224  mstar = std::min((size_t)U,(size_t)mmax);
1225  const bool implemented = false;
1226  assert(implemented == true);
1227  }
1228 
1229  { // use recursion if needed
1230  const Real two_U = 2.0 * U;
1231  // simplified URR
1232  if (mmax > mstar) {
1233  for(int m=mstar+1; m<=mmax; ++m) {
1234  Gm_0_U_[m+1] = numbers_.twoi1[m] * (1.0 - two_U * Gm_0_U_[m]);
1235  }
1236  }
1237 
1238  // simplified DRR
1239  if (mstar > mmin) { // instead of -1 because we trigger this only for U > 5
1240  const Real one_over_U = 2.0 / two_U;
1241  for(int m=mstar-1; m>=mmin; --m) {
1242  Gm_0_U_[m+1] = one_over_U * ( 0.5 - numbers_.ihalf[m+2] * Gm_0_U_[m+2]);
1243  }
1244  }
1245  }
1246 
1247  // testing ...
1248  std::copy(Gm_0_U_.begin()+1, Gm_0_U_.begin()+mmax+2, Gm0U);
1249 
1250  return;
1251  }
1252 
1253 
1255  static Real eval_Gm1(Real T, Real U) {
1256  const Real sqrt_U = sqrt(U);
1257  const Real sqrt_T = sqrt(T);
1258  const Real exp_mT = exp(-T);
1259  const Real kappa = sqrt_U - sqrt_T;
1260  const Real lambda = sqrt_U + sqrt_T;
1261  const Real sqrtPi_over_4(0.44311346272637900682454187083528629569938736403060);
1262  const Real result = sqrtPi_over_4 * exp_mT *
1263  (exp(kappa*kappa) * (1 - erf(kappa)) + exp(lambda*lambda) * (1 - erf(lambda))) / sqrt_U;
1264  return result;
1265  }
1267  static Real eval_G0(Real T, Real U) {
1268  const Real sqrt_U = sqrt(U);
1269  const Real sqrt_T = sqrt(T);
1270  const Real exp_mT = exp(-T);
1271  const Real kappa = sqrt_U - sqrt_T;
1272  const Real lambda = sqrt_U + sqrt_T;
1273  const Real sqrtPi_over_4(0.44311346272637900682454187083528629569938736403060);
1274  const Real result = sqrtPi_over_4 * exp_mT *
1275  (exp(kappa*kappa) * (1 - erf(kappa)) - exp(lambda*lambda) * (1 - erf(lambda))) / sqrt_T;
1276  return result;
1277  }
1280  static void eval_G_m1_0(Real* result, Real T, Real U) {
1281  const Real sqrt_U = sqrt(U);
1282  const Real sqrt_T = sqrt(T);
1283  const Real oo_sqrt_U = 1 / sqrt_U;
1284  const Real oo_sqrt_T = 1 / sqrt_T;
1285  const Real exp_mT = exp(-T);
1286  const Real kappa = sqrt_U - sqrt_T;
1287  const Real lambda = sqrt_U + sqrt_T;
1288  const Real sqrtPi_over_4(0.44311346272637900682454187083528629569938736403060);
1289  const Real pfac = sqrtPi_over_4 * exp_mT;
1290  const Real erfc_k = exp(kappa*kappa) * (1 - erf(kappa));
1291  const Real erfc_l = exp(lambda*lambda) * (1 - erf(lambda));
1292  result[0] = pfac * (erfc_k + erfc_l) * oo_sqrt_U;
1293  result[1] = pfac * (erfc_k - erfc_l) * oo_sqrt_T;
1294  }
1295 
1297  static Real eval_MacLaurinT(Real T, Real U, size_t m, Real absolute_precision) {
1298  assert(false); // not yet implemented
1299  return 0.0;
1300  }
1301 
1302  private:
1303  std::vector<Real> Gm_0_U_; // used for MacLaurin expansion
1304  unsigned int mmax_;
1305  Real precision_;
1306  ExpensiveNumbers<Real> numbers_;
1307 
1308  // since evaluation may involve several functions, will store some intermediate constants here
1309  // to avoid the cost of extra parameters
1310  //Real exp_U_;
1311  //Real exp_mT_;
1312 
1313  size_t count_tenno_algorithm_branches[3]; // counts the number of times each branch Ten-no algorithm
1314  // was picked
1315 
1316  };
1317 #endif
1318 
1319  template<typename Real, int k>
1321 
1322  namespace detail {
1323 
1325  template <typename CoreEval> struct CoreEvalScratch {
1326  CoreEvalScratch() = default;
1327  CoreEvalScratch(int) { }
1328  };
1330  template <typename Real>
1331  struct CoreEvalScratch<GaussianGmEval<Real, -1>> {
1332  std::vector<Real> Fm_;
1333  std::vector<Real> g_i;
1334  std::vector<Real> r_i;
1335  std::vector<Real> oorhog_i;
1336  CoreEvalScratch() = default;
1337  CoreEvalScratch(int mmax) {
1338  init(mmax);
1339  }
1340  private:
1341  void init(int mmax) {
1342  Fm_.resize(mmax+1);
1343  g_i.resize(mmax+1);
1344  r_i.resize(mmax+1);
1345  oorhog_i.resize(mmax+1);
1346  g_i[0] = 1.0;
1347  r_i[0] = 1.0;
1348  }
1349  };
1350  } // namespace libint2::detail
1351 
1355 
1377  template<typename Real, int k>
1378  struct GaussianGmEval : private detail::CoreEvalScratch<GaussianGmEval<Real,k>> // N.B. empty-base optimization
1379  {
1380 
1384  GaussianGmEval(int mmax, Real precision) :
1386  precision_(precision), fm_eval_(),
1387  numbers_(-1,-1,mmax) {
1388  assert(k == -1 || k == 0 || k == 2);
1389  // for k=-1 need to evaluate the Boys function
1390  if (k == -1) {
1391  fm_eval_ = FmEval_Taylor<Real>::instance(mmax_, precision_);
1392 // fm_eval_ = FmEval_Chebyshev3::instance(mmax_);
1393  }
1394  }
1395 
1396  ~GaussianGmEval() {
1397  }
1398 
1399  // some features require at least C++11
1400 #if LIBINT2_CPLUSPLUS_STD >= 2011
1401  static const std::shared_ptr<GaussianGmEval>& instance(unsigned int mmax, Real precision) {
1404 
1405  // thread-safe per C++11 standard [6.7.4]
1406  static auto instance_ = std::shared_ptr<GaussianGmEval>{};
1407 
1408  const bool need_new_instance = !instance_ ||
1409  (instance_ && (instance_->max_m() < mmax ||
1410  instance_->precision() > precision));
1411  if (need_new_instance) {
1412  auto new_instance = std::make_shared<GaussianGmEval>(mmax, precision);
1413  instance_ = new_instance; // thread-safe
1414  }
1415 
1416  return instance_;
1417  }
1418 #endif
1419 
1421  int max_m() const { return mmax_; }
1423  Real precision() const { return precision_; }
1424 
1439  template <typename AnyReal>
1440  void eval(Real* Gm, Real rho, Real T, size_t mmax,
1441  const std::vector<std::pair<AnyReal, AnyReal> >& geminal,
1442  void* scr = 0) {
1443 
1444  std::fill(Gm, Gm+mmax+1, Real(0));
1445 
1446  const double sqrt_rho = sqrt(rho);
1447  const double oo_sqrt_rho = 1.0/sqrt_rho;
1448  if (k == -1) {
1449  void* _scr = (scr == 0) ? this : scr;
1450  auto& scratch = *(reinterpret_cast<detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(_scr));
1451  for(int i=1; i<=mmax; i++) {
1452  scratch.r_i[i] = scratch.r_i[i-1] * rho;
1453  }
1454  }
1455 
1456  typedef typename std::vector<std::pair<AnyReal, AnyReal> >::const_iterator citer;
1457  const citer gend = geminal.end();
1458  for(citer i=geminal.begin(); i!= gend; ++i) {
1459 
1460  const double gamma = i->first;
1461  const double gcoef = i->second;
1462  const double rhog = rho + gamma;
1463  const double oorhog = 1.0/rhog;
1464 
1465  const double gorg = gamma * oorhog;
1466  const double rorg = rho * oorhog;
1467  const double sqrt_rho_org = sqrt_rho * oorhog;
1468  const double sqrt_rhog = sqrt(rhog);
1469  const double sqrt_rorg = sqrt_rho_org * sqrt_rhog;
1470 
1472  const Real const_SQRTPI_2(0.88622692545275801364908374167057259139877472806119); /* sqrt(pi)/2 */
1473  const double SS_K0G12_SS = gcoef * oo_sqrt_rho * const_SQRTPI_2 * rorg * sqrt_rorg * exp(-gorg*T);
1474 
1475  if (k == -1) {
1476  void* _scr = (scr == 0) ? this : scr;
1477  auto& scratch = *(reinterpret_cast<detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(_scr));
1478 
1479  const double rorgT = rorg * T;
1480  fm_eval_->eval(&scratch.Fm_[0], rorgT, mmax);
1481 
1482 #if 1
1483  const Real const_2_SQRTPI(1.12837916709551257389615890312154517); /* 2/sqrt(pi) */
1484  const Real pfac = const_2_SQRTPI * sqrt_rhog * SS_K0G12_SS;
1485  scratch.oorhog_i[0] = pfac;
1486  for(int i=1; i<=mmax; i++) {
1487  scratch.g_i[i] = scratch.g_i[i-1] * gamma;
1488  scratch.oorhog_i[i] = scratch.oorhog_i[i-1] * oorhog;
1489  }
1490  for(int m=0; m<=mmax; m++) {
1491  Real ssss = 0.0;
1492  Real* bcm = numbers_.bc[m];
1493  for(int n=0; n<=m; n++) {
1494  ssss += bcm[n] * scratch.r_i[n] * scratch.g_i[m-n] * scratch.Fm_[n];
1495  }
1496  Gm[m] += ssss * scratch.oorhog_i[m];
1497  }
1498 #endif
1499  }
1500 
1501  if (k == 0) {
1502  double ss_oper_ss_m = SS_K0G12_SS;
1503  Gm[0] += ss_oper_ss_m;
1504  for(int m=1; m<=mmax; ++m) {
1505  ss_oper_ss_m *= gorg;
1506  Gm[m] += ss_oper_ss_m;
1507  }
1508  }
1509 
1510  if (k == 2) {
1511 
1513  const double rorgT = rorg * T;
1514  const double SS_K2G12_SS_0 = (1.5 + rorgT) * (SS_K0G12_SS * oorhog);
1515  const double SS_K2G12_SS_m1 = rorg * (SS_K0G12_SS * oorhog);
1516 
1517  double SS_K2G12_SS_gorg_m = SS_K2G12_SS_0 ;
1518  double SS_K2G12_SS_gorg_m1 = SS_K2G12_SS_m1;
1519  Gm[0] += SS_K2G12_SS_gorg_m;
1520  for(int m=1; m<=mmax; ++m) {
1521  SS_K2G12_SS_gorg_m *= gorg;
1522  Gm[m] += SS_K2G12_SS_gorg_m - m * SS_K2G12_SS_gorg_m1;
1523  SS_K2G12_SS_gorg_m1 *= gorg;
1524  }
1525  }
1526 
1527  }
1528 
1529  }
1530 
1531  private:
1532  int mmax_;
1533  Real precision_; //< absolute precision
1534  std::shared_ptr<FmEval_Taylor<Real>> fm_eval_;
1535 
1536  ExpensiveNumbers<Real> numbers_;
1537  };
1538 
1539  template <typename GmEvalFunction>
1540  struct GenericGmEval {
1541 
1542  GenericGmEval(unsigned int mmax, double precision) : mmax_(mmax), precision_(precision) {}
1543 
1544  static std::shared_ptr<GenericGmEval> instance(unsigned int mmax, double precision = 0.0) {
1545  return std::make_shared<GenericGmEval>(mmax, precision);
1546  }
1547 
1548  template <typename Real, typename... ExtraArgs>
1549  void eval(Real* Gm, Real rho, Real T, size_t mmax, ExtraArgs... args) {
1550  assert(mmax <= mmax_);
1551  GmEvalFunction eval_;
1552  eval_(Gm, rho, T, mmax, std::forward(args)...);
1553  }
1554 
1555  private:
1556  unsigned int mmax_;
1557  double precision_;
1558  };
1559 
1560  /*
1561  * Slater geminal fitting is available only if have LAPACK
1562  */
1563 #if HAVE_LAPACK
1564  /*
1565  f[x_] := - Exp[-\[Zeta] x] / \[Zeta];
1566 
1567  ff[cc_, aa_, x_] := Sum[cc[[i]]*Exp[-aa[[i]] x^2], {i, 1, n}];
1568  */
1569  template <typename Real>
1570  Real
1571  fstg(Real zeta,
1572  Real x) {
1573  return -std::exp(-zeta*x)/zeta;
1574  }
1575 
1576  template <typename Real>
1577  Real
1578  fngtg(const std::vector<Real>& cc,
1579  const std::vector<Real>& aa,
1580  Real x) {
1581  Real value = 0.0;
1582  const Real x2 = x * x;
1583  const unsigned int n = cc.size();
1584  for(unsigned int i=0; i<n; ++i)
1585  value += cc[i] * std::exp(- aa[i] * x2);
1586  return value;
1587  }
1588 
1589  // --- weighting functions ---
1590  // L2 error is weighted by ww(x)
1591  // hence error is weighted by sqrt(ww(x))
1592  template <typename Real>
1593  Real
1594  wwtewklopper(Real x) {
1595  const Real x2 = x * x;
1596  return x2 * std::exp(-2 * x2);
1597  }
1598  template <typename Real>
1599  Real
1600  wwcusp(Real x) {
1601  const Real x2 = x * x;
1602  const Real x6 = x2 * x2 * x2;
1603  return std::exp(-0.005 * x6);
1604  }
1605  // default is Tew-Klopper
1606  template <typename Real>
1607  Real
1608  ww(Real x) {
1609  //return wwtewklopper(x);
1610  return wwcusp(x);
1611  }
1612 
1613  template <typename Real>
1614  Real
1615  norm(const std::vector<Real>& vec) {
1616  Real value = 0.0;
1617  const unsigned int n = vec.size();
1618  for(unsigned int i=0; i<n; ++i)
1619  value += vec[i] * vec[i];
1620  return value;
1621  }
1622 
1623  template <typename Real>
1624  void LinearSolveDamped(const std::vector<Real>& A,
1625  const std::vector<Real>& b,
1626  Real lambda,
1627  std::vector<Real>& x) {
1628  const size_t n = b.size();
1629  std::vector<Real> Acopy(A);
1630  for(size_t m=0; m<n; ++m) Acopy[m*n + m] *= (1 + lambda);
1631  std::vector<Real> e(b);
1632 
1633  //int info = LAPACKE_dgesv( LAPACK_ROW_MAJOR, n, 1, &Acopy[0], n, &ipiv[0], &e[0], n );
1634  {
1635  std::vector<int> ipiv(n);
1636  int n = b.size();
1637  int one = 1;
1638  int info;
1639  dgesv_(&n, &one, &Acopy[0], &n, &ipiv[0], &e[0], &n, &info);
1640  assert (info == 0);
1641  }
1642 
1643  x = e;
1644  }
1645 
1656  template <typename Real>
1657  void stg_ng_fit(unsigned int n,
1658  Real zeta,
1659  std::vector< std::pair<Real, Real> >& geminal,
1660  Real xmin = 0.0,
1661  Real xmax = 10.0,
1662  unsigned int npts = 1001) {
1663 
1664  // initial guess
1665  std::vector<Real> cc(n, 1.0); // coefficients
1666  std::vector<Real> aa(n); // exponents
1667  for(unsigned int i=0; i<n; ++i)
1668  aa[i] = std::pow(3.0, (i + 2 - (n + 1)/2.0));
1669 
1670  // first rescale cc for ff[x] to match the norm of f[x]
1671  Real ffnormfac = 0.0;
1672  for(unsigned int i=0; i<n; ++i)
1673  for(unsigned int j=0; j<n; ++j)
1674  ffnormfac += cc[i] * cc[j]/std::sqrt(aa[i] + aa[j]);
1675  const Real Nf = std::sqrt(2.0 * zeta) * zeta;
1676  const Real Nff = std::sqrt(2.0) / (std::sqrt(ffnormfac) *
1677  std::sqrt(std::sqrt(M_PI)));
1678  for(unsigned int i=0; i<n; ++i) cc[i] *= -Nff/Nf;
1679 
1680  Real lambda0 = 1000; // damping factor is initially set to 1000, eventually should end up at 0
1681  const Real nu = 3.0; // increase/decrease the damping factor scale it by this
1682  const Real epsilon = 1e-15; // convergence
1683  const unsigned int maxniter = 200;
1684 
1685  // grid points on which we will fit
1686  std::vector<Real> xi(npts);
1687  for(unsigned int i=0; i<npts; ++i) xi[i] = xmin + (xmax - xmin)*i/(npts - 1);
1688 
1689  std::vector<Real> err(npts);
1690 
1691  const size_t nparams = 2*n; // params = expansion coefficients + gaussian exponents
1692  std::vector<Real> J( npts * nparams );
1693  std::vector<Real> delta(nparams);
1694 
1695 // std::cout << "iteration 0" << std::endl;
1696 // for(unsigned int i=0; i<n; ++i)
1697 // std::cout << cc[i] << " " << aa[i] << std::endl;
1698 
1699  Real errnormI;
1700  Real errnormIm1 = 1e3;
1701  bool converged = false;
1702  unsigned int iter = 0;
1703  while (!converged && iter < maxniter) {
1704 // std::cout << "Iteration " << ++iter << ": lambda = " << lambda0/nu << std::endl;
1705 
1706  for(unsigned int i=0; i<npts; ++i) {
1707  const Real x = xi[i];
1708  err[i] = (fstg(zeta, x) - fngtg(cc, aa, x)) * std::sqrt(ww(x));
1709  }
1710  errnormI = norm(err)/std::sqrt((Real)npts);
1711 
1712 // std::cout << "|err|=" << errnormI << std::endl;
1713  converged = std::abs((errnormI - errnormIm1)/errnormIm1) <= epsilon;
1714  if (converged) break;
1715  errnormIm1 = errnormI;
1716 
1717  for(unsigned int i=0; i<npts; ++i) {
1718  const Real x2 = xi[i] * xi[i];
1719  const Real sqrt_ww_x = std::sqrt(ww(xi[i]));
1720  const unsigned int ioffset = i * nparams;
1721  for(unsigned int j=0; j<n; ++j)
1722  J[ioffset+j] = (std::exp(-aa[j] * x2)) * sqrt_ww_x;
1723  const unsigned int ioffsetn = ioffset+n;
1724  for(unsigned int j=0; j<n; ++j)
1725  J[ioffsetn+j] = - sqrt_ww_x * x2 * cc[j] * std::exp(-aa[j] * x2);
1726  }
1727 
1728  std::vector<Real> A( nparams * nparams);
1729  for(size_t r=0, rc=0; r<nparams; ++r) {
1730  for(size_t c=0; c<nparams; ++c, ++rc) {
1731  double Arc = 0.0;
1732  for(size_t i=0, ir=r, ic=c; i<npts; ++i, ir+=nparams, ic+=nparams)
1733  Arc += J[ir] * J[ic];
1734  A[rc] = Arc;
1735  }
1736  }
1737 
1738  std::vector<Real> b( nparams );
1739  for(size_t r=0; r<nparams; ++r) {
1740  Real br = 0.0;
1741  for(size_t i=0, ir=r; i<npts; ++i, ir+=nparams)
1742  br += J[ir] * err[i];
1743  b[r] = br;
1744  }
1745 
1746  // try decreasing damping first
1747  // if not successful try increasing damping until it results in a decrease in the error
1748  lambda0 /= nu;
1749  for(int l=-1; l<1000; ++l) {
1750 
1751  LinearSolveDamped(A, b, lambda0, delta );
1752 
1753  std::vector<double> cc_0(cc); for(unsigned int i=0; i<n; ++i) cc_0[i] += delta[i];
1754  std::vector<double> aa_0(aa); for(unsigned int i=0; i<n; ++i) aa_0[i] += delta[i+n];
1755 
1756  // if any of the exponents are negative the step is too large and need to increase damping
1757  bool step_too_large = false;
1758  for(unsigned int i=0; i<n; ++i)
1759  if (aa_0[i] < 0.0) {
1760  step_too_large = true;
1761  break;
1762  }
1763  if (!step_too_large) {
1764  std::vector<double> err_0(npts);
1765  for(unsigned int i=0; i<npts; ++i) {
1766  const double x = xi[i];
1767  err_0[i] = (fstg(zeta, x) - fngtg(cc_0, aa_0, x)) * std::sqrt(ww(x));
1768  }
1769  const double errnorm_0 = norm(err_0)/std::sqrt((double)npts);
1770  if (errnorm_0 < errnormI) {
1771  cc = cc_0;
1772  aa = aa_0;
1773  break;
1774  }
1775  else // step lead to increase of the error -- try dampening a bit more
1776  lambda0 *= nu;
1777  }
1778  else // too large of a step
1779  lambda0 *= nu;
1780  } // done adjusting the damping factor
1781 
1782  } // end of iterative minimization
1783 
1784  // if reached max # of iterations throw if the error is too terrible
1785  assert(not (iter == maxniter && errnormI > 1e-10));
1786 
1787  for(unsigned int i=0; i<n; ++i)
1788  geminal[i] = std::make_pair(aa[i], cc[i]);
1789  }
1790 #endif
1791 
1792 } // end of namespace libint2
1793 
1794 #endif // C++ only
1795 #endif // header guard
holds tables of expensive quantities
Definition: boys.h:55
SIMD vector of 2 double-precision floating-point real numbers, operations on which use SSE2 instructi...
Definition: vector_x86.h:39
FmEval_Chebyshev3(int m_max, double=0.0)
Definition: boys.h:255
int max_m() const
Definition: boys.h:289
static const std::shared_ptr< FmEval_Taylor > & instance(unsigned int mmax, Real precision)
Singleton interface allows to manage the lone instance; adjusts max m and precision values as needed ...
Definition: boys.h:854
FmEval_Chebyshev7(int m_max, double=0.0)
Definition: boys.h:540
SIMD vector of 4 double-precision floating-point real numbers, operations on which use AVX instructio...
Definition: vector_x86.h:639
void eval(Real *Fm, Real x, int m_max) const
fills in Fm with computed Boys function values for m in [0,mmax]
Definition: boys.h:580
Defaults definitions for various parameters assumed by Libint.
Definition: algebra.cc:23
void eval(Real *Fm, Real T, int mmax) const
computes Boys function values with m index in range [0,mmax]
Definition: boys.h:881
static void eval(Real *Fm, Real T, size_t mmax, Real absolute_precision)
fills up an array of Fm(T) for m in [0,mmax]
Definition: boys.h:174
static void eval_G_m1_0(Real *result, Real T, Real U)
computes and , both are needed for Yukawa and Slater integrals
Definition: boys.h:1280
some evaluators need thread-local scratch, but most don&#39;t
Definition: boys.h:1325
Computes the Boys function, $ F_m (T) = ^1 u^{2m} (-T u^2) \, { d}u $, using multi-algorithm approach...
Definition: boys.h:205
Real precision() const
Definition: boys.h:874
int max_m() const
Definition: boys.h:872
static Real eval_G0(Real T, Real U)
computes a single value of G_0(T,U)
Definition: boys.h:1267
static void eval_yukawa_s1(Real *Gm, Real T, Real U, size_t mmax)
Scheme 1 of Ten-no: upward recursion from and T must be non-zero!
Definition: boys.h:1109
void load(T const *a)
loads a to this
Definition: vector_x86.h:720
FmEval_Taylor(unsigned int mmax, Real precision)
Constructs the object to be able to compute Boys funcion for m in [0,mmax], with relative precision...
Definition: boys.h:759
core integral for Yukawa and exponential interactions
Definition: boys.h:1082
Computes the Boys function, $ F_m (T) = ^1 u^{2m} (-T u^2) \, { d}u $, using 7-th order Chebyshev int...
Definition: boys.h:524
Definition: engine.h:834
Definition: boys.h:1320
void eval_yukawa_s2(Real *Gm, Real T, Real U, size_t mmax)
Scheme 2 of Ten-no:
Definition: boys.h:1146
void load_aligned(T const *a)
loads a to this
Definition: vector_x86.h:725
Computes the Boys function, $ F_m (T) = ^1 u^{2m} (-T u^2) \, { d}u $, using 3-rd order Chebyshev int...
Definition: boys.h:239
void convert(T *a) const
writes this to a
Definition: vector_x86.h:129
Computes the Boys function, , using single algorithm (asymptotic expansion).
Definition: boys.h:139
void eval_yukawa_Gm0U(Real *Gm0U, Real U, int mmax, int mmin=-1)
computes prerequisites for MacLaurin expansion of Gm(T,U) for m in [-1,mmax); uses Ten-no&#39;s prescript...
Definition: boys.h:1197
void eval(Real *Fm, Real x, int m_max) const
fills in Fm with computed Boys function values for m in [0,mmax]
Definition: boys.h:295
void convert(T *a) const
writes this to a
Definition: vector_x86.h:729
int max_m() const
Definition: boys.h:574
static Real eval(Real T, size_t m, Real absolute_precision)
computes a single value of using MacLaurin series.
Definition: boys.h:142
static Real eval_Gm1(Real T, Real U)
computes a single value of G_{-1}(T,U)
Definition: boys.h:1255
static Real eval_MacLaurinT(Real T, Real U, size_t m, Real absolute_precision)
computes a single value of G(T,U) using MacLaurin series.
Definition: boys.h:1297
void eval_yukawa_s3(Real *Gm, Real T, Real U, size_t mmax)
Scheme 3 of Ten-no:
Definition: boys.h:1174
Computes the Boys function, $ F_m (T) = ^1 u^{2m} (-T u^2) \, { d}u $, using Taylor interpolation of ...
Definition: boys.h:752
Real precision() const
Definition: boys.h:1095
void load_aligned(T const *a)
loads a to this
Definition: vector_x86.h:125
static void eval(Real *Fm, Real t, size_t mmax, Real absolute_precision)
fills up an array of Fm(T) for m in [0,mmax]
Definition: boys.h:212