File indexing completed on 2026-04-09 07:48:49
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021 import numpy as np
0022 import os, logging, builtins
0023 import itertools
0024
0025 try:
0026 from hashlib import md5
0027 except ImportError:
0028 from md5 import md5
0029
0030
0031
0032 log = logging.getLogger(__name__)
0033
0034 try:
0035 from scipy.stats import chi2 as _chi2
0036 except ImportError:
0037 _chi2 = None
0038
0039
def np_string(a):
    """
    Render array *a* as a compact comma-separated string, with the
    outer brackets and all spaces removed.
    """
    return np.array2string(a, separator=",").strip('[]').replace(' ', '')
0043
0044
def np_tostring(a, scale=1e6):
    """
    Serialize 2D array *a* to a whitespace-separated string of values,
    dividing column 0 by *scale* first. The input array is not modified.

    :param a: 2D float array
    :param scale: divisor applied to column 0 before serializing
    :return: space-joined string of the scaled, flattened values
    """
    scaled = a.copy()
    scaled[:, 0] /= scale
    return " ".join(str(v) for v in scaled.ravel())
0049
def np_fromstring(values, coldim=2, scale=1e6):
    """
    Inverse of np_tostring: parse a whitespace-separated string of floats
    into a float32 array of shape (-1, coldim), multiplying column 0 by *scale*.

    :param values: whitespace-separated string of numbers
    :param coldim: number of columns to reshape into
    :param scale: multiplier applied to column 0 after parsing
    :return: float32 array of shape (n, coldim)

    Uses np.fromiter rather than the text mode of np.fromstring, which is
    deprecated in modern numpy.
    """
    a = np.fromiter(map(float, values.split()), dtype=np.float32).reshape(-1, coldim)
    a[:, 0] *= scale
    return a
0054
0055 np_fromtxt = lambda txt, dtype=np.float32:np.fromiter( map(float, txt.replace("("," ").replace(")"," ").replace(","," ").strip().split() ), dtype=dtype )
0056
0057
0058
0059 class Buf(np.ndarray): pass
0060
0061
0062
def ahash(a):
    """
    Return a hex string hash of the raw array content bytes.

    :param a: numpy array of any dtype
    :return: lowercase hex string (may carry a leading '-' for negative hashes)

    * http://stackoverflow.com/questions/16589791/most-efficient-property-to-hash-for-numpy-array

    Hashes ``a.tobytes()`` rather than the former ``hash(a.data)``:
    in Python 3 hashing a memoryview is only defined for byte formats
    ('B', 'b', 'c'), so the old approach raised for float/int arrays,
    and it also forced the input array read-only as a side effect.

    NOTE: as with any ``hash()`` of bytes the value is only stable within
    a single process unless PYTHONHASHSEED is fixed.
    """
    return "%x" % hash(a.tobytes())
0078
0079
0080
0081
def find_ranges(i):
    """
    Yield (start, end) pairs describing runs of consecutive integers.

    :param i: sorted list of integers

    E.g. given the set {0, 1, 2, 3, 4, 7, 8, 9, 11} I want to get { {0,4}, {7,9}, {11,11} }.

    http://stackoverflow.com/questions/4628333/converting-a-list-of-integers-into-range-in-python

    Within a run of consecutive values, value-minus-index is constant,
    so grouping enumerate pairs on that difference splits the runs.
    """
    # groupby calls the key with a single (index, value) tuple; the former
    # two-argument lambda raised TypeError here
    func = lambda pair: pair[1] - pair[0]

    for _, grp in itertools.groupby(enumerate(i), func):
        grp = list(grp)
        yield grp[0][1], grp[-1][1]
0096
def count_unique_truncating(vals):
    """
    Return an array of (value, count) rows for the distinct values in *vals*.

    http://stackoverflow.com/questions/10741346/numpy-frequency-counts-for-unique-values-in-an-array

    NOTE(review): the result dtype follows numpy promotion of values and
    counts, which presumably is the "truncating" in the name — confirm
    against the large-uint64 tests below.
    """
    uniq = np.unique(vals)
    counts = np.bincount(uniq.searchsorted(vals))
    return np.vstack((uniq, counts)).T
0104
def count_unique_old(vals):
    """
    Return an array of (value, count) rows for the distinct values in *vals*,
    without using the numpy 1.9+ return_counts option; counts are cast to
    uint64 before stacking.
    """
    uniq = np.unique(vals)
    counts = np.bincount(uniq.searchsorted(vals)).astype(np.uint64)
    return np.vstack((uniq, counts)).T
0112
def count_unique_new(vals):
    """
    Return an array of (value, count) rows for the distinct values in *vals*,
    using the np.unique return_counts option (requires at least numpy 1.9).
    """
    values, counts = np.unique(vals, return_counts=True)
    return np.vstack((values, counts.astype(np.uint64))).T
0119
def count_unique(vals):
    """
    Count occurrences of each distinct value in *vals*, preferring the
    return_counts implementation and falling back for older numpy which
    raises TypeError on the return_counts keyword.
    """
    try:
        return count_unique_new(vals)
    except TypeError:
        return count_unique_old(vals)
0127
0128
def unique2D_subarray(a):
    """
    Return the distinct sub-arrays along the first axis of *a*.

    https://stackoverflow.com/questions/40674696/numpy-unique-2d-sub-array

    Each sub-array is flattened and viewed as an opaque void record so
    np.unique can compare whole rows at once.
    """
    flat = np.ascontiguousarray(a.reshape(a.shape[0], -1))
    void_dt = np.dtype((np.void, a.dtype.itemsize * np.prod(a.shape[1:])))
    _, first_idx = np.unique(flat.view(void_dt), return_index=True)
    return a[first_idx]
0136
0137
0138
0139
def np_digest(a):
    """
    Return the md5 hexdigest of the raw data bytes of array *a*.

    https://stackoverflow.com/questions/5386694/fast-way-to-hash-numpy-objects-for-caching

    NB a file digest includes the numpy header, not just the data,
    so it will not match this.
    """
    raw = np.ascontiguousarray(a.view(np.uint8))
    hasher = md5()
    hasher.update(raw)
    return hasher.hexdigest()
0151
0152
def array_digest(a):
    """
    Convenience alias for :func:`np_digest`.

    :param a: numpy array
    :return: md5 hexdigest string of the raw array data bytes

    https://stackoverflow.com/questions/5386694/fast-way-to-hash-numpy-objects-for-caching

    file digest includes the header, not just the data : so will not match this
    """
    return np_digest(a)
0160
0161
def cus(s):
    """
    Count occurrences of each distinct value, ordered most-frequent first.

    :param s: array of shape (n,)
    :return cus: uint64 array of shape (num_unique, 2) with (value, count) rows
    """
    values, counts = np.unique(s, return_counts=True)
    order = np.argsort(counts)[::-1]
    out = np.zeros((len(values), 2), dtype=np.uint64)
    out[:, 0] = values[order]
    out[:, 1] = counts[order]
    return out
0173
0174
0175
0176
def count_unique_sorted(vals):
    """
    (value, count) rows for *vals*, sorted by descending count, as uint64.

    Older numpy has problems running argsort on an empty result, hence
    the length guard.
    """
    cu = count_unique(vals)
    if len(cu) > 0:
        order = np.argsort(cu[:, 1])[::-1]
        cu = cu[order]
    return cu.astype(np.uint64)
0187
def vnorm(a):
    """
    Row-wise Euclidean norm of a 2D array.

    Older numpy lacks the third axis argument form of np.linalg.norm,
    so this stands in for::

        r = np.linalg.norm(xyz[:,:2], 2, 1)
    """
    return np.sqrt((a * a).sum(axis=1))
0198
def vnorm_(v):
    # row-wise Euclidean norm, def-style equivalent of the former lambda
    return np.sqrt(np.sum(v * v, 1))

def costheta_(a, b):
    # cosine of the angle between corresponding rows of a and b
    return np.sum(a * b, axis=1) / (vnorm(a) * vnorm(b))
0202
0203
0204
0205
def chi2_pvalue( c2obs, ndf ):
    """
    :param c2obs: observed chi2 value
    :param ndf: number of degrees of freedom
    :return pvalue: 1 - _chi2.cdf(c2obs, ndf), or 1.0 when scipy is unavailable

    * https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html

    For ndf:10 you want to know if a chi2 value of 18.307
    should be regarded as consistent with a null hypothesis.
    By looking at the probability that the chi2 were more than
    that::

        In [2]: from scipy.stats import chi2 as _chi2
        In [27]: 1 - _chi2.cdf(18.307, 10) ## chi2 of 18.307 for ndf:10 : on the edge of being significant deviation
        Out[27]: 0.05000058909139815

        In [4]: 1-_chi2.cdf(20, 10 ) ## chi2 of 20 for ndf:10 : regarded as significant deviation from null
        Out[4]: 0.02925268807696113

    A p-value of less than or equal to 0.05 is regarded as evidence of a
    statistically significant result, and in these cases, the null hypothesis
    should be rejected in favor of the alternative hypothesis.

    :google:`interpret chi2 values`

    ::

        In [49]: _chi2.cdf( 15.99, 10 ) ## for ndf 10, probability for chi2 < 15.99 is 0.900
        Out[49]: 0.90008098002000925

        In [53]: _chi2.cdf( range(10,21), 10 ) ## probability of getting chi2 < the value
        Out[53]: array([ 0.5595, 0.6425, 0.7149, 0.7763, 0.827 , 0.8679, 0.9004, 0.9256, 0.945 , 0.9597, 0.9707])

        In [54]: 1 - _chi2.cdf( range(10,21), 10 ) ## probability of getting chi2 > the value
        Out[54]: array([ 0.4405, 0.3575, 0.2851, 0.2237, 0.173 , 0.1321, 0.0996, 0.0744, 0.055 , 0.0403, 0.0293])

    * https://en.wikipedia.org/wiki/Chi-squared_distribution

    The p-value is the probability of observing a test statistic at least as
    extreme in a chi-squared distribution. Accordingly, since the cumulative
    distribution function (CDF) for the appropriate degrees of freedom (df) gives
    the probability of having obtained a value less extreme than this point,
    subtracting the CDF value from 1 gives the p-value.

    A low p-value indicates greater statistical significance, i.e. greater
    confidence that the observed deviation from the null hypothesis is significant.
    A p-value of 0.05 is often used as a cutoff between significant and
    not-significant results.

    Of course for Opticks-CFG4 comparisons I wish to see no significant
    deviations, so I want the p-value to be close to 1 indicating nothing of significance.

    ::

        In [56]: _chi2.cdf( [3.94,4.87,6.18,7.27,9.34,11.78,13.44,15.99,18.31,23.21,29.59], 10 )
        Out[56]: array([ 0.05 , 0.1003, 0.2001, 0.3003, 0.4998, 0.6999, 0.7999, 0.9001, 0.95 , 0.99 , 0.999 ])
        ## probability of a chi2 value less than those values for ndf 10

        In [57]: 1 - _chi2.cdf( [3.94,4.87,6.18,7.27,9.34,11.78,13.44,15.99,18.31,23.21,29.59], 10 ) ## P-value (Probability)
        Out[57]: array([ 0.95 , 0.8997, 0.7999, 0.6997, 0.5002, 0.3001, 0.2001, 0.0999, 0.05 , 0.01 , 0.001 ])
        ## probability of a chi2 greater than those values for ndf 10

    * ~/opticks_refs/PoissonConsistency.pdf
    * http://www.hep.caltech.edu/~fcp/statistics/hypothesisTest/PoissonConsistency/PoissonConsistency.pdf

    * https://www.scribbr.com/statistics/chi-square-distributions/
    * https://www.scribbr.com/statistics/chi-square-tests/
    * https://www.scribbr.com/statistics/chi-square-distribution-table/
    * https://www.scribbr.com/wp-content/uploads/2022/05/Chi-square-table.pdf

    So called 5% critical values, to judge null hypotheses for different
    degrees of freedom::

        In [22]: df = np.arange(1,31)

        In [23]: cr5 = _chi2.ppf( 0.95,df)

        In [24]: _chi2.cdf( cr5 , df)
        Out[24]: array([0.95, 0.95, 0.95, ...])

        In [25]: np.c_[df,cr5]
        Out[25]:
        array([[ 1. ,  3.841],
               [ 2. ,  5.991],
               [ 3. ,  7.815],
               [ 4. ,  9.488],
               [ 5. , 11.07 ],
               [ 6. , 12.592],
               [ 7. , 14.067],
               [ 8. , 15.507],
               [ 9. , 16.919],
               [10. , 18.307],
               [11. , 19.675],
               [12. , 21.026],
               [13. , 22.362],
               [14. , 23.685],
               [15. , 24.996],
               [16. , 26.296],
               [17. , 27.587],

    * https://www.nevis.columbia.edu/~seligman/root-class/html/appendix/statistics/ChiSquaredDOF.html

    Chi2 critical values

    * https://www.itl.nist.gov/div898/handbook/eda/section3/eda3674.htm

    """
    if _chi2 is None:
        # scipy unavailable : fall back to the "nothing significant" extreme,
        # returned as a float for type consistency with the computed path
        return 1.0

    p_value = 1 - _chi2.cdf(c2obs, ndf)
    return p_value
0334
0335
def chi2one(a, b):
    """Elementwise chi2 contribution: squared difference over the sum."""
    diff = a - b
    return diff * diff / (a + b)
0338
def chi2(a_, b_, cut=30):
    """
    :param a_: array of counts
    :param b_: array of counts
    :param cut: applied to the sum of a and b, excluding low counts from the chi2
    :return c2,c2n,c2c:

    *c2*
        array with elementwise square of difference over the sum
        (masked by the cut on the sum, zeros provided for low stat entries)
    *c2n*
        number of bins within the count cut
    *c2c*
        number of bins not within count cut

    ::

        c2, c2n, c2c = chi2(a, b)
        c2ndf = c2.sum()/c2n

    # ChiSquared or KS
    # http://www.itl.nist.gov/div898/handbook/eda/section3/eda35f.htm
    # https://en.wikipedia.org/wiki/Propagation_of_uncertainty
    # http://stats.stackexchange.com/questions/7400/how-to-assess-the-similarity-of-two-histograms

    # http://www.hep.caltech.edu/~fcp/statistics/hypothesisTest/PoissonConsistency/PoissonConsistency.pdf
    # "Testing Consistency of Two Histograms", Frank Porter, California Institute of Technology
    """
    a = a_.astype(np.float32)
    b = b_.astype(np.float32)

    passed = (a + b) > cut
    diff = a - b

    c2 = np.zeros_like(a)
    c2[passed] = (diff * diff)[passed] / (a + b)[passed]

    c2n = int(passed.sum())
    c2c = int((~passed).sum())
    assert c2n + c2c == len(a)

    return c2, c2n, c2c
0383
def ratio(a, b):
    """
    Elementwise ratios a/b and b/a with sqrt(N) numerator errors.

    :param a: array of counts
    :param b: array of counts
    :return ab, ba: arrays of shape (len(a), 2) with (ratio, error) columns,
        left as zero wherever either input is not positive
    """
    ok = np.logical_and(a > 0, b > 0)

    ab = np.zeros((len(a), 2))
    ba = np.zeros((len(a), 2))

    ab[ok, 0] = a[ok] / b[ok]
    ab[ok, 1] = np.sqrt(a[ok]) / b[ok]
    ba[ok, 0] = b[ok] / a[ok]
    ba[ok, 1] = np.sqrt(b[ok]) / a[ok]

    return ab, ba
0397
0398
0399
0400
0401
def test_count_unique_(fn, a):
    """
    Check that *fn* counts three identical uint64 values and recovers the
    value itself without precision loss (count_unique variants may
    round-trip through float and truncate large integers).
    """
    aa = np.array([a, a, a], dtype=np.uint64)
    cu = fn(aa)
    assert cu[0, 1] == 3

    a_ = np.uint64(cu[0, 0])
    msg = "OK" if a_ == a else "FAIL"
    log.info("test_count_unique_ %16x %16x %s %s %s " % (a, a_, msg, a, a_) )
0424
0425
def test_count_unique():
    """
    Exercise each count_unique variant on large uint64 values and on
    nibble masks of increasing width.
    """
    log.info("test_count_unique")
    vals = [0xfedcba9876543210, 0xffffffffffffffff]

    msk_ = lambda n: (1 << 4 * n) - 1
    for n in range(16):
        vals.append(msk_(n))

    for fn in [count_unique_new, count_unique_old, count_unique_truncating]:
        log.info(fn.__name__)
        # explicit loop : the former map(...) is lazy in Python 3,
        # so the checks were never executed
        for v in vals:
            test_count_unique_(fn, v)
0439
0440
def test_chi2():
    """Smoke test chi2 on a small pair of count arrays."""
    a = np.array([0, 100, 500, 500], dtype=np.int32)
    b = np.array([0, 100, 500, 0], dtype=np.int32)
    c2, c2n, c2c = chi2(a, b, cut=30)
0446
0447
def test_count_unique_2():
    """count_unique yields one (value, count) row per distinct value."""
    log.info("test_count_unique_2")
    src = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5], dtype=np.uint32)
    expected = np.array([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]], dtype=np.uint64)
    assert np.all(count_unique(src) == expected)
0455
0456
def test_count_unique_sorted():
    """count_unique_sorted orders (value, count) rows by descending count."""
    log.info("test_count_unique_sorted")
    src = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5], dtype=np.uint32)
    expected = np.array([[5, 5], [4, 4], [3, 3], [2, 2], [1, 1]], dtype=np.uint64)
    assert np.all(count_unique_sorted(src) == expected)
0464
0465
def test_count_unique_sorted_empty():
    """count_unique_sorted must cope with an empty input array."""
    log.info("test_count_unique_sorted_empty np.__version__ %s " % np.__version__ )
    empty = np.array([], dtype=np.uint32)
    result = count_unique_sorted(empty)

    expected = np.vstack((np.array([], dtype=np.uint64),
                          np.array([], dtype=np.uint64))).T
    assert np.all(result == expected)
0476
0477
0478
if __name__ == '__main__':

    logging.basicConfig(level=logging.INFO)
    log.info("np.__version__ %s " % np.__version__ )

    # run the sibling-function count_unique smoke tests
    for _test in (test_count_unique_2,
                  test_count_unique_sorted,
                  test_count_unique_sorted_empty):
        _test()
0489
0490
0491
0492
0493
0494
0495