include/vdt/tanh.h

0001 /*
0002  * tanh.h
0003  * The basic idea is to exploit Pade polynomials.
0004  * Implemented by Manuel Schiller for LHCb.
0005  *
0006  *  Created on: Sep 23, 2017
0007  *      Author: Paul Seyfert, Manuel Schiller
0008  */
0009
0010 /*
0011  * VDT is free software: you can redistribute it and/or modify
0012  * it under the terms of the GNU Lesser Public License as published by
0013  * the Free Software Foundation, either version 3 of the License, or
0014  * (at your option) any later version.
0015  *
0016  * This program is distributed in the hope that it will be useful,
0017  * but WITHOUT ANY WARRANTY; without even the implied warranty of
0018  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0019  * GNU Lesser Public License for more details.
0020  *
0021  * You should have received a copy of the GNU Lesser Public License
0022  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
0023  */
0024
0025 #ifndef TANH_H_
0026 #define TANH_H_
0027
0028 #include "vdtcore_common.h"
0029
0030 namespace vdt{
0031
0032
0033
0034
/// Fast hyperbolic tangent, double precision.
///
/// The argument is scaled down by a factor of 8, a rational (Padé-type)
/// approximation is evaluated on the small argument, and the result is
/// brought back to the full argument by applying the doubling identity
///     tanh(2y) = 2 tanh(y) / (1 + tanh^2(y))
/// three times.
///
/// @param x  argument
/// @return   approximation of tanh(x); exactly +1 or -1 (carrying the sign
///           of x) once |x| exceeds 20
inline double fast_tanh(double x){
    // Beyond |x| = 20 the result is indistinguishable from +-1 in double
    // precision, so saturate right away. The branch is deliberate: per the
    // original authors' note, branch-free code takes longer to execute.
    // A NaN fails this comparison and simply propagates through the
    // arithmetic below.
    const double magnitude = std::abs(x);
    if (magnitude > 20.) {
        return std::copysign(1., x);
    }

    // Work on x/8; the three doubling steps at the end undo the scaling.
    const double reduced = x * 0.125;
    const double sq = reduced * reduced;

    // Rational approximation in Horner form: odd degree-7 numerator over an
    // even degree-6 denominator.
    const double top    = 135135. + sq * (17325. + sq * ( 378. + sq));
    const double bottom = 135135. + sq * (62370. + sq * (3150. + sq * 28.));
    double t = reduced * top / bottom;

    // tanh(x/8) -> tanh(x/4) -> tanh(x/2) -> tanh(x)
    for (int step = 0; step < 3; ++step) {
        t = 2 * t / (t * t + 1);
    }
    return t;
}
0055
0056 //------------------------------------------------------------------------------
/// Fast hyperbolic tangent, single precision.
///
/// Follows the same scheme as the double precision version: evaluate a
/// rational approximation at x/8, then apply the doubling identity
///     tanh(2y) = 2 tanh(y) / (1 + tanh^2(y))
/// three times. A lower-order rational (cubic over quadratic) is used,
/// which the original authors consider sufficient for float.
///
/// @param x  argument
/// @return   approximation of tanh(x); exactly +1 or -1 (carrying the sign
///           of x) once |x| exceeds 9.1
inline float fast_tanhf( float x ) {
    // Saturate for large |x|. The branch is deliberate: per the original
    // authors' note, branch-free code takes longer to execute.
    const float magnitude = std::abs(x);
    if (magnitude > 9.1f) {
        return std::copysign(1.f, x);
    }

    // Work on x/8; the three doubling steps at the end undo the scaling.
    const float reduced = x * 0.125f;
    const float sq = reduced * reduced;

    // y * (y^2 + 15) / (6 y^2 + 15), evaluated entirely in float
    float t = reduced * (sq + 15.f) / (6.f * sq + 15.f);

    // tanh(x/8) -> tanh(x/4) -> tanh(x/2) -> tanh(x)
    for (int step = 0; step < 3; ++step) {
        t = 2.f * t / (t * t + 1.f);
    }
    return t;
}
0071
0072 //------------------------------------------------------------------------------
0073 // Vector signatures
0074
// Array ("vector") entry points. They are only declared in this header; the
// definitions are not visible here.
// NOTE(review): presumably each call reads `size` elements from `iarray` and
// writes the element-wise tanh to `oarray`, with the `fast_` variants using
// the approximations in this header -- confirm against the implementation.
// The __restrict__ qualifiers mean the caller promises that the input and
// output arrays do not overlap.

/// Double precision array signature.
void tanhv(const uint32_t size,
           const double* __restrict__ iarray,
           double* __restrict__ oarray);

/// Double precision array signature, `fast_` variant.
void fast_tanhv(const uint32_t size,
                const double* __restrict__ iarray,
                double* __restrict__ oarray);

/// Single precision array signature.
void tanhfv(const uint32_t size,
            const float* __restrict__ iarray,
            float* __restrict__ oarray);

/// Single precision array signature, `fast_` variant.
void fast_tanhfv(const uint32_t size,
                 const float* __restrict__ iarray,
                 float* __restrict__ oarray);
0079
0080 }// end of vdt
0081
0082 #endif // end of tanh