Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:48:49

0001 #!/usr/bin/env python
0002 #
0003 # Copyright (c) 2019 Opticks Team. All Rights Reserved.
0004 #
0005 # This file is part of Opticks
0006 # (see https://bitbucket.org/simoncblyth/opticks).
0007 #
0008 # Licensed under the Apache License, Version 2.0 (the "License"); 
0009 # you may not use this file except in compliance with the License.  
0010 # You may obtain a copy of the License at
0011 #
0012 #   http://www.apache.org/licenses/LICENSE-2.0
0013 #
0014 # Unless required by applicable law or agreed to in writing, software 
0015 # distributed under the License is distributed on an "AS IS" BASIS, 
0016 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
0017 # See the License for the specific language governing permissions and 
0018 # limitations under the License.
0019 #
0020 
0021 import numpy as np
0022 import os, logging, builtins
0023 import itertools
0024 
0025 try: 
0026     from hashlib import md5 
0027 except ImportError: 
0028     from md5 import md5 
0029 
0030 
0031 
0032 log = logging.getLogger(__name__) 
0033 
0034 try:
0035     from scipy.stats import chi2 as _chi2
0036 except ImportError:
0037     _chi2 = None 
0038 
0039 
def np_string(a):
    """Return a compact string form of array *a*: comma separated, with brackets and spaces removed."""
    rep = np.array2string(a, separator=",").strip('[]')
    return rep.replace(' ', '')
0043 
0044 
def np_tostring(a, scale=1e6):
    """
    Serialize a 2d array to a space separated string of values,
    with the first column divided by *scale* (inverse of np_fromstring).

    :param a: 2d array (not modified; a copy is scaled)
    :param scale: divisor applied to column 0 before serialization
    :return: space separated string of the scaled, flattened values
    """
    scaled = a.copy()
    scaled[:, 0] /= scale
    return " ".join(str(v) for v in scaled.ravel())
0049 
def np_fromstring(values, coldim=2, scale=1e6):
    """
    Parse a space separated string of numbers into a float32 array of
    shape (-1, coldim), multiplying the first column by *scale*
    (inverse of np_tostring).

    :param values: space separated string of numbers
    :param coldim: number of columns to reshape into
    :param scale: multiplier applied to column 0 after parsing
    :return: float32 array of shape (n, coldim)

    NOTE: previously used ``np.fromstring(values, sep=' ')`` which is
    deprecated (and slated for removal); ``np.fromiter`` over the split
    tokens is the supported equivalent.
    """
    a = np.fromiter(map(float, values.split()), dtype=np.float32).reshape(-1, coldim)
    a[:, 0] *= scale
    return a
0054 
0055 np_fromtxt = lambda txt, dtype=np.float32:np.fromiter( map(float, txt.replace("("," ").replace(")"," ").replace(","," ").strip().split() ), dtype=dtype ) 
0056 
0057 
0058 
0059 class Buf(np.ndarray): pass
0060 
0061 
0062 
def ahash(a):
    """
    Return a hex string of the Python hash of the array's raw data bytes.

    :param a: ndarray; marked read-only as a side effect, signalling that
              the content backing the hash must not change afterwards
    :return: hex string, e.g. '6240f8645439a71b'

    * http://stackoverflow.com/questions/16589791/most-efficient-property-to-hash-for-numpy-array

    NOTE: ``hash()`` of bytes is randomized per process (PYTHONHASHSEED),
    so the value is only stable within a single process; use np_digest
    for a persistent digest.

    The previous implementation hashed ``a.data`` directly, but under
    Python 3 hashing a memoryview is restricted to the byte formats
    'B', 'b' and 'c' and raises ValueError for e.g. int or float arrays;
    hashing ``a.tobytes()`` works for any dtype.
    """
    a.flags.writeable = False
    return "%x" % hash(a.tobytes())
0078 
0079 
0080 
0081 
def find_ranges(i):
    """
    Yield (first, last) pairs for each run of consecutive integers in *i*.

    :param i: sorted list of integers

    E.g. given the set {0, 1, 2, 3, 4, 7, 8, 9, 11} I want to get { {0,4}, {7,9}, {11,11} }.

    http://stackoverflow.com/questions/4628333/converting-a-list-of-integers-into-range-in-python
    """
    # (value - index) is constant within a run of consecutive integers,
    # so grouping on it splits the sequence into runs
    keyfunc = lambda pair: pair[1] - pair[0]
    for _, grp in itertools.groupby(enumerate(i), keyfunc):
        run = list(grp)
        yield run[0][1], run[-1][1]
0096 
def count_unique_truncating(vals):
    """
    Frequency table for *vals*: array of (unique_value, count) rows.

    http://stackoverflow.com/questions/10741346/numpy-frequency-counts-for-unique-values-in-an-array
    """
    u = np.unique(vals)
    idx = u.searchsorted(vals)      # map each value back to its position in the unique set
    return np.vstack((u, np.bincount(idx))).T
0104 
def count_unique_old(vals):
    """
    Frequency table for *vals* with counts cast to uint64.

    NOTE: np.vstack finds a common dtype for values and counts, so the
    result may be upcast (e.g. to float) depending on the input dtype.
    """
    u = np.unique(vals)
    counts = np.bincount(u.searchsorted(vals)).astype(np.uint64)
    return np.vstack((u, counts)).T
0112 
def count_unique_new(vals):
    """
    Frequency table for *vals* using the return_counts option of
    np.unique, which requires at least numpy 1.9.
    """
    u, counts = np.unique(vals, return_counts=True)
    counts = counts.astype(np.uint64)
    return np.vstack((u, counts)).T
0119 
def count_unique(vals):
    """
    Frequency table for *vals*, preferring the numpy >= 1.9
    return_counts implementation and falling back to the searchsorted
    one when the option is unavailable (TypeError).
    """
    try:
        return count_unique_new(vals)
    except TypeError:
        return count_unique_old(vals)
0127 
0128 
def unique2D_subarray(a):
    """
    Return the rows of *a* (entries along the first axis) that are
    unique when compared as whole sub-arrays, keeping the first
    occurrence of each.

    https://stackoverflow.com/questions/40674696/numpy-unique-2d-sub-array
    """
    # view each sub-array as one opaque void record so np.unique can compare whole rows
    rowtype = np.dtype((np.void, a.dtype.itemsize * np.prod(a.shape[1:])))
    records = np.ascontiguousarray(a.reshape(a.shape[0], -1)).view(rowtype)
    _, first = np.unique(records, return_index=True)
    return a[first]
0136 
0137 
0138 
0139 
def np_digest(a):
    """
    Hex md5 digest over the raw data bytes of array *a*.

    https://stackoverflow.com/questions/5386694/fast-way-to-hash-numpy-objects-for-caching

    A digest of a saved .npy file includes the header, not just the
    data, so it will not match this.
    """
    raw = np.ascontiguousarray(a.view(np.uint8))
    h = md5()
    h.update(raw)
    return h.hexdigest()
0151 
0152 
def array_digest(a):
    """
    Alias for np_digest: hex md5 over the array data bytes only.

    https://stackoverflow.com/questions/5386694/fast-way-to-hash-numpy-objects-for-caching

    A digest of a saved .npy file includes the header, not just the
    data, so it will not match this.
    """
    return np_digest(a)
0160 
0161 
def cus(s):
    """
    Count-unique-sorted table.

    :param s: array of shape (n,)
    :return cus: uint64 array of shape (num_unique, 2) with rows
        (value, count) ordered by descending count
    """
    values, counts = np.unique(s, return_counts=True)
    order = np.argsort(counts)[::-1]            # most frequent first
    table = np.zeros((len(values), 2), dtype=np.uint64)
    table[:, 0] = values[order]
    table[:, 1] = counts[order]
    return table
0173 
0174 
0175 
0176 
def count_unique_sorted(vals):
    """
    Count-unique table for *vals* ordered by descending frequency,
    returned as uint64.

    Older numpy has a problem with the argsort line when the table is
    empty, hence the length guard.
    """
    # cannot pre-cast vals to uint64: may hold negative pdg codes
    table = count_unique(vals)
    if len(table) > 0:
        order = np.argsort(table[:, 1])[::-1]    # descending frequency order
        table = table[order]
    pass
    return table.astype(np.uint64)
0187 
def vnorm(a):
    """
    Row-wise Euclidean length of *a*, shape (n,m) -> (n,).

    Older numpy lacks the third axis argument form of np.linalg.norm
    so this replaces it::

        #r = np.linalg.norm(xyz[:,:2], 2, 1)
        r = vnorm(xyz[:,:2])

    """
    return np.sqrt((a * a).sum(axis=1))
0198 
0199 vnorm_ = lambda _:np.sqrt(np.sum(_*_,1))
0200 
0201 costheta_ = lambda a,b:np.sum(a * b, axis = 1)/(vnorm(a)*vnorm(b))
0202 
0203 
0204 
0205 
def chi2_pvalue( c2obs, ndf ):
    """
    :param c2obs: observed chi2 value
    :param ndf: number of degrees of freedom
    :return pvalue: 1 - _chi2.cdf(c2obs, ndf), or 1 when scipy is unavailable

    The p-value is the probability of observing a chi2 at least as
    extreme as *c2obs* for the given *ndf*.  Since the cumulative
    distribution function gives the probability of a less extreme value,
    subtracting it from 1 gives the p-value.

    A p-value of less than or equal to 0.05 is regarded as evidence of a
    statistically significant deviation from the null hypothesis.  For
    Opticks-CFG4 comparisons no significant deviation is wanted, so the
    p-value should be close to 1::

        In [2]: from scipy.stats import chi2 as _chi2
        In [27]: 1 - _chi2.cdf(18.307, 10)  ## on the edge of significance for ndf:10
        Out[27]: 0.05000058909139815

        In [4]: 1 - _chi2.cdf(20, 10)       ## regarded as significant deviation
        Out[4]: 0.02925268807696113

    The so-called 5% critical values per ndf can be recovered with the
    inverse CDF, e.g. ``_chi2.ppf(0.95, ndf)`` gives 3.841 for ndf:1,
    18.307 for ndf:10.

    * https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    * https://en.wikipedia.org/wiki/Chi-squared_distribution
    * https://www.itl.nist.gov/div898/handbook/eda/section3/eda3674.htm
    * http://www.hep.caltech.edu/~fcp/statistics/hypothesisTest/PoissonConsistency/PoissonConsistency.pdf
    * ~/opticks_refs/PoissonConsistency.pdf
    * https://www.scribbr.com/statistics/chi-square-distribution-table/
    * https://www.nevis.columbia.edu/~seligman/root-class/html/appendix/statistics/ChiSquaredDOF.html
    """
    if _chi2 is None:
        return 1        # scipy not installed: report "no significance" rather than failing
    pass
    return 1 - _chi2.cdf(c2obs, ndf)
0334 
0335 
def chi2one(a, b):
    """Elementwise chi2 contribution (a-b)^2/(a+b) for two count arrays."""
    d = a - b
    return d * d / (a + b)
0338 
def chi2(a_, b_, cut=30):
    """
    :param a_: array of counts
    :param b_: array of counts
    :param cut: applied to the sum of a and b, excludes low counts from the chi2
    :return c2,c2n,c2c:

    *c2*
         array with elementwise square of difference over the sum
         (masked by the cut on the sum, zeros provided for low stat entries)
    *c2n*
         number of bins within the count cut
    *c2c*
         number of bins not within count cut

    ::

        c2, c2n, c2c = chi2(a, b)
        c2ndf = c2.sum()/c2n

    # ChiSquared or KS
    # http://www.itl.nist.gov/div898/handbook/eda/section3/eda35f.htm
    # https://en.wikipedia.org/wiki/Propagation_of_uncertainty
    # http://stats.stackexchange.com/questions/7400/how-to-assess-the-similarity-of-two-histograms

    # http://www.hep.caltech.edu/~fcp/statistics/hypothesisTest/PoissonConsistency/PoissonConsistency.pdf
    # ~/opticks_refs/PoissonConsistency.pdf   Link now dead ?
    # "Testing Consistency of Two Histograms", Frank Porter, California Institute of Technology

    """
    a = a_.astype(np.float32)
    b = b_.astype(np.float32)

    total = a + b
    sel = total > cut                 # only well populated bins contribute

    c2 = np.zeros_like(a)
    diff = a - b
    c2[sel] = (diff * diff)[sel] / total[sel]

    c2n = len(a[sel])
    c2c = len(a[~sel])
    assert c2n + c2c == len(a)

    return c2, c2n, c2c
0383 
def ratio(a, b):
    """
    Elementwise ratios a/b and b/a with sqrt-of-numerator errors,
    zeroed wherever either count is non-positive.

    :param a: array of counts
    :param b: array of counts
    :return ab, ba: (n,2) arrays of (ratio, error)
    """
    valid = (a > 0) & (b > 0)

    ab = np.zeros((len(a), 2))
    ba = np.zeros((len(a), 2))

    ab[valid, 0] = a[valid] / b[valid]
    ab[valid, 1] = np.sqrt(a[valid]) / b[valid]

    ba[valid, 0] = b[valid] / a[valid]
    ba[valid, 1] = np.sqrt(b[valid]) / a[valid]

    return ab, ba
0397 
0398 
0399 
0400 
0401 
def test_count_unique_(fn, a):
    """
    Check that *fn* recovers the value *a* (stored three times as uint64)
    without precision loss: count_unique appears to go via floats which
    looses precision for large numbers.
    """
    arr = np.array([a, a, a], dtype=np.uint64)

    cu = fn(arr)
    assert cu[0, 1] == 3

    recovered = np.uint64(cu[0, 0])
    msg = "OK" if recovered == a else "FAIL"
    log.info("test_count_unique_ %16x %16x %s  %s %s  " % (a, recovered, msg, a, recovered) )
0424 
0425 
def test_count_unique():
    """
    Exercise the count_unique variants with values spanning the full
    uint64 range (all-F nibble masks plus two fixed patterns).
    """
    log.info("test_count_unique")
    vals=[0xfedcba9876543210,0xffffffffffffffff]

    msk_ = lambda n:(1 << 4*n) - 1     # n low nibbles all set
    for n in range(16):
        msk = msk_(n)
        vals.append(msk)

    for fn in [count_unique_new, count_unique_old, count_unique_truncating]:
        log.info(fn.__name__)
        # NB a bare map() is lazy under Python 3 and would never run the
        # checks; iterate explicitly so test_count_unique_ actually executes
        for v in vals:
            test_count_unique_(fn, v)
0439 
0440 
def test_chi2():
    """Smoke test: chi2 runs on small integer count arrays including a low-stat bin."""
    u = np.array([0, 100, 500, 500], dtype=np.int32)
    v = np.array([0, 100, 500, 0], dtype=np.int32)
    c2, c2n, c2c = chi2(u, v, cut=30)
0446 
0447 
def test_count_unique_2():
    """count_unique on a small array whose value i occurs i times."""
    log.info("test_count_unique_2")
    src = np.array( [1,2,2,3,3,3,4,4,4,4,5,5,5,5,5], dtype=np.uint32 )
    expected = np.array( [[1,1],[2,2],[3,3],[4,4], [5,5]], dtype=np.uint64 )

    assert np.all( count_unique(src) == expected )
0455 
0456 
def test_count_unique_sorted():
    """count_unique_sorted returns (value,count) rows in descending count order."""
    log.info("test_count_unique_sorted")
    src = np.array( [1,2,2,3,3,3,4,4,4,4,5,5,5,5,5], dtype=np.uint32 )
    expected = np.array( [[5,5],[4,4],[3,3],[2,2], [1,1]], dtype=np.uint64 )

    assert np.all( count_unique_sorted(src) == expected )
0464 
0465 
def test_count_unique_sorted_empty():
    """count_unique_sorted must cope with an empty input (argsort on empty broke older numpy)."""
    log.info("test_count_unique_sorted_empty np.__version__ %s " % np.__version__ )
    res = count_unique_sorted( np.array( [], dtype=np.uint32 ) )

    empty_u = np.array( [], dtype=np.uint64 )
    empty_c = np.array( [], dtype=np.uint64 )
    expected = np.vstack( (empty_u, empty_c) ).T

    assert np.all( res == expected )
0476 
0477 
0478 
if __name__ == '__main__':

    # configure logging before the first log.info so the output is visible
    logging.basicConfig(level=logging.INFO)
    log.info("np.__version__ %s " % np.__version__ )

    # NOTE(review): the first two exercises are left disabled; confirm whether intentional
    #test_count_unique()
    #test_chi2()
    test_count_unique_2()
    test_count_unique_sorted()
    test_count_unique_sorted_empty()
0489 
0490 
0491 
0492   
0493 
0494 
0495