File indexing completed on 2026-04-09 07:48:49
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021 import numpy as np
0022 import os, logging, builtins
0023 import itertools
0024
0025 try:
0026 from hashlib import md5
0027 except ImportError:
0028 from md5 import md5
0029
0030
0031
0032 log = logging.getLogger(__name__)
0033
0034 try:
0035 from scipy.stats import chi2 as _chi2
0036 except ImportError:
0037 _chi2 = None
0038
0039
def np_string(a):
    """
    Render array *a* as a compact comma-separated string, with the
    outer brackets and all spaces removed.
    """
    return np.array2string(a, separator=",").strip('[]').replace(' ', '')
0043
0044
def np_tostring(a, scale=1e6):
    """
    Serialize 2D array *a* to a whitespace-separated string of values,
    dividing column 0 by *scale* first. The input array is not modified.

    :param a: 2D float array
    :param scale: divisor applied to column 0 before serializing
    :return: space-joined string of the scaled, flattened values
    """
    scaled = a.copy()
    scaled[:, 0] /= scale
    return " ".join(str(v) for v in scaled.ravel())
0049
def np_fromstring(values, coldim=2, scale=1e6):
    """
    Inverse of np_tostring: parse a whitespace-separated string of floats
    into a float32 array of shape (-1, coldim), multiplying column 0 by *scale*.

    :param values: whitespace-separated string of numbers
    :param coldim: number of columns to reshape into
    :param scale: multiplier applied to column 0 after parsing
    :return: float32 array of shape (n, coldim)

    Uses np.fromiter rather than the text mode of np.fromstring, which is
    deprecated in modern numpy.
    """
    a = np.fromiter(map(float, values.split()), dtype=np.float32).reshape(-1, coldim)
    a[:, 0] *= scale
    return a
0054
0055 np_fromtxt = lambda txt, dtype=np.float32:np.fromiter( map(float, txt.replace("("," ").replace(")"," ").replace(","," ").strip().split() ), dtype=dtype )
0056
0057
0058
0059 class Buf(np.ndarray): pass
0060
0061
0062
def ahash(a):
    """
    Return a hex string hash of the raw array content bytes.

    :param a: numpy array of any dtype
    :return: lowercase hex string (may carry a leading '-' for negative hashes)

    * http://stackoverflow.com/questions/16589791/most-efficient-property-to-hash-for-numpy-array

    Hashes ``a.tobytes()`` rather than the former ``hash(a.data)``:
    in Python 3 hashing a memoryview is only defined for byte formats
    ('B', 'b', 'c'), so the old approach raised for float/int arrays,
    and it also forced the input array read-only as a side effect.

    NOTE: as with any ``hash()`` of bytes the value is only stable within
    a single process unless PYTHONHASHSEED is fixed.
    """
    return "%x" % hash(a.tobytes())
0078
0079
0080
0081
def find_ranges(i):
    """
    Yield (start, end) pairs describing runs of consecutive integers.

    :param i: sorted list of integers

    E.g. given the set {0, 1, 2, 3, 4, 7, 8, 9, 11} I want to get { {0,4}, {7,9}, {11,11} }.

    http://stackoverflow.com/questions/4628333/converting-a-list-of-integers-into-range-in-python

    Within a run of consecutive values, value-minus-index is constant,
    so grouping enumerate pairs on that difference splits the runs.
    """
    # groupby calls the key with a single (index, value) tuple; the former
    # two-argument lambda raised TypeError here
    func = lambda pair: pair[1] - pair[0]

    for _, grp in itertools.groupby(enumerate(i), func):
        grp = list(grp)
        yield grp[0][1], grp[-1][1]
0096
def count_unique_truncating(vals):
    """
    Return an array of (value, count) rows for the distinct values in *vals*.

    http://stackoverflow.com/questions/10741346/numpy-frequency-counts-for-unique-values-in-an-array

    NOTE(review): the result dtype follows numpy promotion of values and
    counts, which presumably is the "truncating" in the name — confirm
    against the large-uint64 tests below.
    """
    uniq = np.unique(vals)
    counts = np.bincount(uniq.searchsorted(vals))
    return np.vstack((uniq, counts)).T
0104
def count_unique_old(vals):
    """
    Return an array of (value, count) rows for the distinct values in *vals*,
    without using the numpy 1.9+ return_counts option; counts are cast to
    uint64 before stacking.
    """
    uniq = np.unique(vals)
    counts = np.bincount(uniq.searchsorted(vals)).astype(np.uint64)
    return np.vstack((uniq, counts)).T
0112
def count_unique_new(vals):
    """
    Return an array of (value, count) rows for the distinct values in *vals*,
    using the np.unique return_counts option (requires at least numpy 1.9).
    """
    values, counts = np.unique(vals, return_counts=True)
    return np.vstack((values, counts.astype(np.uint64))).T
0119
def count_unique(vals):
    """
    Count occurrences of each distinct value in *vals*, preferring the
    return_counts implementation and falling back for older numpy which
    raises TypeError on the return_counts keyword.
    """
    try:
        return count_unique_new(vals)
    except TypeError:
        return count_unique_old(vals)
0127
0128
def unique2D_subarray(a):
    """
    Return the distinct sub-arrays along the first axis of *a*.

    https://stackoverflow.com/questions/40674696/numpy-unique-2d-sub-array

    Each sub-array is flattened and viewed as an opaque void record so
    np.unique can compare whole rows at once.
    """
    flat = np.ascontiguousarray(a.reshape(a.shape[0], -1))
    void_dt = np.dtype((np.void, a.dtype.itemsize * np.prod(a.shape[1:])))
    _, first_idx = np.unique(flat.view(void_dt), return_index=True)
    return a[first_idx]
0136
0137
0138
0139
def np_digest(a):
    """
    Return the md5 hexdigest of the raw data bytes of array *a*.

    https://stackoverflow.com/questions/5386694/fast-way-to-hash-numpy-objects-for-caching

    NB a file digest includes the numpy header, not just the data,
    so it will not match this.
    """
    raw = np.ascontiguousarray(a.view(np.uint8))
    hasher = md5()
    hasher.update(raw)
    return hasher.hexdigest()
0151
0152
def array_digest(a):
    """
    Convenience alias for :func:`np_digest`.

    :param a: numpy array
    :return: md5 hexdigest string of the raw array data bytes

    https://stackoverflow.com/questions/5386694/fast-way-to-hash-numpy-objects-for-caching

    file digest includes the header, not just the data : so will not match this
    """
    return np_digest(a)
0160
0161
def cus(s):
    """
    Count occurrences of each distinct value, ordered most-frequent first.

    :param s: array of shape (n,)
    :return cus: uint64 array of shape (num_unique, 2) with (value, count) rows
    """
    values, counts = np.unique(s, return_counts=True)
    order = np.argsort(counts)[::-1]
    out = np.zeros((len(values), 2), dtype=np.uint64)
    out[:, 0] = values[order]
    out[:, 1] = counts[order]
    return out
0173
0174
0175
0176
def count_unique_sorted(vals):
    """
    (value, count) rows for *vals*, sorted by descending count, as uint64.

    Older numpy has problems running argsort on an empty result, hence
    the length guard.
    """
    cu = count_unique(vals)
    if len(cu) > 0:
        order = np.argsort(cu[:, 1])[::-1]
        cu = cu[order]
    return cu.astype(np.uint64)
0187
def vnorm(a):
    """
    Row-wise Euclidean norm of a 2D array.

    Older numpy lacks the third axis argument form of np.linalg.norm,
    so this stands in for::

        r = np.linalg.norm(xyz[:,:2], 2, 1)
    """
    return np.sqrt((a * a).sum(axis=1))
0198
def vnorm_(v):
    # row-wise Euclidean norm, def-style equivalent of the former lambda
    return np.sqrt(np.sum(v * v, 1))

def costheta_(a, b):
    # cosine of the angle between corresponding rows of a and b
    return np.sum(a * b, axis=1) / (vnorm(a) * vnorm(b))
0202
0203
0204
0205
def chi2_pvalue( c2obs, ndf ):
    """
    :param c2obs: observed chi2 value
    :param ndf: number of degrees of freedom
    :return pvalue: 1 - _chi2.cdf(c2obs, ndf), or 1.0 when scipy is unavailable

    * https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html

    For ndf:10 you want to know if a chi2 value of 18.307
    should be regarded as consistent with a null hypothesis.
    By looking at the probability that the chi2 were more than
    that::

        In [2]: from scipy.stats import chi2 as _chi2
        In [27]: 1 - _chi2.cdf(18.307, 10) ## chi2 of 18.307 for ndf:10 : on the edge of being significant deviation
        Out[27]: 0.05000058909139815

        In [4]: 1-_chi2.cdf(20, 10 ) ## chi2 of 20 for ndf:10 : regarded as significant deviation from null
        Out[4]: 0.02925268807696113

    A p-value of less than or equal to 0.05 is regarded as evidence of a
    statistically significant result, and in these cases, the null hypothesis
    should be rejected in favor of the alternative hypothesis.

    :google:`interpret chi2 values`

    ::

        In [49]: _chi2.cdf( 15.99, 10 ) ## for ndf 10, probability for chi2 < 15.99 is 0.900
        Out[49]: 0.90008098002000925

        In [53]: _chi2.cdf( range(10,21), 10 ) ## probability of getting chi2 < the value
        Out[53]: array([ 0.5595, 0.6425, 0.7149, 0.7763, 0.827 , 0.8679, 0.9004, 0.9256, 0.945 , 0.9597, 0.9707])

        In [54]: 1 - _chi2.cdf( range(10,21), 10 ) ## probability of getting chi2 > the value
        Out[54]: array([ 0.4405, 0.3575, 0.2851, 0.2237, 0.173 , 0.1321, 0.0996, 0.0744, 0.055 , 0.0403, 0.0293])

    * https://en.wikipedia.org/wiki/Chi-squared_distribution

    The p-value is the probability of observing a test statistic at least as
    extreme in a chi-squared distribution. Accordingly, since the cumulative
    distribution function (CDF) for the appropriate degrees of freedom (df) gives
    the probability of having obtained a value less extreme than this point,
    subtracting the CDF value from 1 gives the p-value.

    A low p-value indicates greater statistical significance, i.e. greater
    confidence that the observed deviation from the null hypothesis is significant.
    A p-value of 0.05 is often used as a cutoff between significant and
    not-significant results.

    Of course for Opticks-CFG4 comparisons I wish to see no significant
    deviations, so I want the p-value to be close to 1 indicating nothing of significance.

    ::

        In [56]: _chi2.cdf( [3.94,4.87,6.18,7.27,9.34,11.78,13.44,15.99,18.31,23.21,29.59], 10 )
        Out[56]: array([ 0.05 , 0.1003, 0.2001, 0.3003, 0.4998, 0.6999, 0.7999, 0.9001, 0.95 , 0.99 , 0.999 ])
        ## probability of a chi2 value less than those values for ndf 10

        In [57]: 1 - _chi2.cdf( [3.94,4.87,6.18,7.27,9.34,11.78,13.44,15.99,18.31,23.21,29.59], 10 ) ## P-value (Probability)
        Out[57]: array([ 0.95 , 0.8997, 0.7999, 0.6997, 0.5002, 0.3001, 0.2001, 0.0999, 0.05 , 0.01 , 0.001 ])
        ## probability of a chi2 greater than those values for ndf 10

    * ~/opticks_refs/PoissonConsistency.pdf
    * http://www.hep.caltech.edu/~fcp/statistics/hypothesisTest/PoissonConsistency/PoissonConsistency.pdf

    * https://www.scribbr.com/statistics/chi-square-distributions/
    * https://www.scribbr.com/statistics/chi-square-tests/
    * https://www.scribbr.com/statistics/chi-square-distribution-table/
    * https://www.scribbr.com/wp-content/uploads/2022/05/Chi-square-table.pdf

    So called 5% critical values, to judge null hypotheses for different
    degrees of freedom::

        In [22]: df = np.arange(1,31)

        In [23]: cr5 = _chi2.ppf( 0.95,df)

        In [24]: _chi2.cdf( cr5 , df)
        Out[24]: array([0.95, 0.95, 0.95, ...])

        In [25]: np.c_[df,cr5]
        Out[25]:
        array([[ 1. ,  3.841],
               [ 2. ,  5.991],
               [ 3. ,  7.815],
               [ 4. ,  9.488],
               [ 5. , 11.07 ],
               [ 6. , 12.592],
               [ 7. , 14.067],
               [ 8. , 15.507],
               [ 9. , 16.919],
               [10. , 18.307],
               [11. , 19.675],
               [12. , 21.026],
               [13. , 22.362],
               [14. , 23.685],
               [15. , 24.996],
               [16. , 26.296],
               [17. , 27.587],

    * https://www.nevis.columbia.edu/~seligman/root-class/html/appendix/statistics/ChiSquaredDOF.html

    Chi2 critical values

    * https://www.itl.nist.gov/div898/handbook/eda/section3/eda3674.htm

    """
    if _chi2 is None:
        # scipy unavailable : fall back to the "nothing significant" extreme,
        # returned as a float for type consistency with the computed path
        return 1.0

    p_value = 1 - _chi2.cdf(c2obs, ndf)
    return p_value
0334
0335
def chi2one(a, b):
    """Elementwise chi2 contribution: squared difference over the sum."""
    diff = a - b
    return diff * diff / (a + b)
0338
def chi2(a_, b_, cut=30):
    """
    :param a_: array of counts
    :param b_: array of counts
    :param cut: applied to the sum of a and b, excluding low counts from the chi2
    :return c2,c2n,c2c:

    *c2*
        array with elementwise square of difference over the sum
        (masked by the cut on the sum, zeros provided for low stat entries)
    *c2n*
        number of bins within the count cut
    *c2c*
        number of bins not within count cut

    ::

        c2, c2n, c2c = chi2(a, b)
        c2ndf = c2.sum()/c2n

    # ChiSquared or KS
    # http://www.itl.nist.gov/div898/handbook/eda/section3/eda35f.htm
    # https://en.wikipedia.org/wiki/Propagation_of_uncertainty
    # http://stats.stackexchange.com/questions/7400/how-to-assess-the-similarity-of-two-histograms

    # http://www.hep.caltech.edu/~fcp/statistics/hypothesisTest/PoissonConsistency/PoissonConsistency.pdf
    # "Testing Consistency of Two Histograms", Frank Porter, California Institute of Technology
    """
    a = a_.astype(np.float32)
    b = b_.astype(np.float32)

    passed = (a + b) > cut
    diff = a - b

    c2 = np.zeros_like(a)
    c2[passed] = (diff * diff)[passed] / (a + b)[passed]

    c2n = int(passed.sum())
    c2c = int((~passed).sum())
    assert c2n + c2c == len(a)

    return c2, c2n, c2c
0383
def ratio(a, b):
    """
    Elementwise ratios a/b and b/a with sqrt(N) numerator errors.

    :param a: array of counts
    :param b: array of counts
    :return ab, ba: arrays of shape (len(a), 2) with (ratio, error) columns,
        left as zero wherever either input is not positive
    """
    ok = np.logical_and(a > 0, b > 0)

    ab = np.zeros((len(a), 2))
    ba = np.zeros((len(a), 2))

    ab[ok, 0] = a[ok] / b[ok]
    ab[ok, 1] = np.sqrt(a[ok]) / b[ok]
    ba[ok, 0] = b[ok] / a[ok]
    ba[ok, 1] = np.sqrt(b[ok]) / a[ok]

    return ab, ba
0397
0398
0399
0400
0401
def test_count_unique_(fn, a):
    """
    Check that *fn* counts three identical uint64 values and recovers the
    value itself without precision loss (count_unique variants may
    round-trip through float and truncate large integers).
    """
    aa = np.array([a, a, a], dtype=np.uint64)
    cu = fn(aa)
    assert cu[0, 1] == 3

    a_ = np.uint64(cu[0, 0])
    msg = "OK" if a_ == a else "FAIL"
    log.info("test_count_unique_ %16x %16x %s %s %s " % (a, a_, msg, a, a_) )
0424
0425
def test_count_unique():
    """
    Exercise each count_unique variant on large uint64 values and on
    nibble masks of increasing width.
    """
    log.info("test_count_unique")
    vals = [0xfedcba9876543210, 0xffffffffffffffff]

    msk_ = lambda n: (1 << 4 * n) - 1
    for n in range(16):
        vals.append(msk_(n))

    for fn in [count_unique_new, count_unique_old, count_unique_truncating]:
        log.info(fn.__name__)
        # explicit loop : the former map(...) is lazy in Python 3,
        # so the checks were never executed
        for v in vals:
            test_count_unique_(fn, v)
0439
0440
def test_chi2():
    """Smoke test chi2 on a small pair of count arrays."""
    a = np.array([0, 100, 500, 500], dtype=np.int32)
    b = np.array([0, 100, 500, 0], dtype=np.int32)
    c2, c2n, c2c = chi2(a, b, cut=30)
0446
0447
def test_count_unique_2():
    """count_unique yields one (value, count) row per distinct value."""
    log.info("test_count_unique_2")
    src = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5], dtype=np.uint32)
    expected = np.array([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]], dtype=np.uint64)
    assert np.all(count_unique(src) == expected)
0455
0456
def test_count_unique_sorted():
    """count_unique_sorted orders (value, count) rows by descending count."""
    log.info("test_count_unique_sorted")
    src = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5], dtype=np.uint32)
    expected = np.array([[5, 5], [4, 4], [3, 3], [2, 2], [1, 1]], dtype=np.uint64)
    assert np.all(count_unique_sorted(src) == expected)
0464
0465
def test_count_unique_sorted_empty():
    """count_unique_sorted must cope with an empty input array."""
    log.info("test_count_unique_sorted_empty np.__version__ %s " % np.__version__ )
    empty = np.array([], dtype=np.uint32)
    result = count_unique_sorted(empty)

    expected = np.vstack((np.array([], dtype=np.uint64),
                          np.array([], dtype=np.uint64))).T
    assert np.all(result == expected)
0476
0477
0478
if __name__ == '__main__':

    logging.basicConfig(level=logging.INFO)
    log.info("np.__version__ %s " % np.__version__ )

    # run the sibling-function count_unique smoke tests
    for _test in (test_count_unique_2,
                  test_count_unique_sorted,
                  test_count_unique_sorted_empty):
        _test()
0489
0490
0491
0492
0493
0494
0495