File indexing completed on 2026-04-09 07:49:47
0001 #pragma once
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052 #include "ssys.h"
0053 #include "sseq.h"
0054 #include "NPX.h"
0055
0056
0057 struct sseq_index_count
0058 {
0059 int index ;
0060 int count ;
0061 std::string desc() const ;
0062 };
0063 inline std::string sseq_index_count::desc() const
0064 {
0065 std::stringstream ss ;
0066 ss << std::setw(7) << count << " : " << std::setw(7) << index ;
0067 std::string str = ss.str();
0068 return str ;
0069 }
0070
0071 struct sseq_index_count_ab
0072 {
0073 sseq_index_count a ;
0074 sseq_index_count b ;
0075 };
0076
0077
0078 struct sseq_unique
0079 {
0080 sseq q ;
0081 sseq_index_count ic ;
0082
0083 bool operator< (const sseq_unique& other) const { return ic.count < other.ic.count ; }
0084 std::string desc() const ;
0085 };
0086
0087 inline std::string sseq_unique::desc() const
0088 {
0089 std::stringstream ss ;
0090 ss << q.seqhis_() << " : " << ic.desc() ;
0091 std::string str = ss.str();
0092 return str ;
0093 }
0094
0095
0096
0097 struct sseq_qab
0098 {
0099 sseq q ;
0100 sseq_index_count a ;
0101 sseq_index_count b ;
0102
0103 double c2(bool& included, int absum_min) const ;
0104
0105 int maxcount() const { return std::max(a.count, b.count) ; }
0106 bool operator< (const sseq_qab& other) const { return maxcount() < other.maxcount() ; }
0107
0108 std::string desc(int absum_min) const ;
0109 };
0110
0111 inline double sseq_qab::c2(bool& included, int absum_min) const
0112 {
0113 double _a = a.count ;
0114 double _b = b.count ;
0115 double absum = _a + _b ;
0116 double abdif = _a - _b ;
0117 included = _a > 0 && _b > 0 && absum > absum_min ;
0118 return included ? abdif*abdif/absum : 0 ;
0119 }
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129 inline std::string sseq_qab::desc(int absum_min) const
0130 {
0131 bool included(false);
0132
0133 std::stringstream ss ;
0134 ss << q.seqhis_()
0135 << " : "
0136 << std::setw(7) << a.count
0137 << std::setw(7) << b.count
0138 << " : "
0139 << std::fixed << std::setw(10) << std::setprecision(4) << c2(included, absum_min)
0140 << " : "
0141 << ( included ? "Y" : "N" )
0142 << " : "
0143 << std::setw(7) << a.index
0144 << std::setw(7) << b.index
0145 ;
0146 std::string str = ss.str();
0147 return str ;
0148 }
0149
0150
0151
0152 struct sseq_index
0153 {
0154 std::vector<sseq> q ;
0155
0156 std::map<sseq, sseq_index_count> m ;
0157
0158 std::vector<sseq_unique> u ;
0159
0160 sseq_index( const NP* seq);
0161
0162 void load_seq( const NP* seq );
0163 void count_unique();
0164 void order_seq();
0165 std::string desc(int min_count=0) const;
0166 };
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180 inline sseq_index::sseq_index( const NP* seq)
0181 {
0182 load_seq(seq);
0183 count_unique();
0184 order_seq();
0185 }
0186
0187
0188 inline void sseq_index::load_seq(const NP* seq)
0189 {
0190 NPX::VecFromArray<sseq>(q, seq );
0191 }
0192
0193
0194
0195
0196
0197
0198
0199
0200
0201
0202
0203
0204
0205
0206 inline void sseq_index::count_unique()
0207 {
0208 for (int i = 0; i < int(q.size()); i++)
0209 {
0210 const sseq& seq = q[i];
0211
0212 std::map<sseq, sseq_index_count>::iterator it = m.find(seq);
0213
0214 if(it == m.end())
0215 {
0216 int q_index_of_first_occurrence = i ;
0217 m[seq] = {q_index_of_first_occurrence, 1} ;
0218 }
0219 else
0220 {
0221 it->second.count++ ;
0222 }
0223 }
0224 }
0225
0226
0227
0228
0229
0230
0231
0232
0233
0234
0235
0236 inline void sseq_index::order_seq()
0237 {
0238 for(auto it=m.begin() ; it != m.end() ; it++) u.push_back( { it->first, {it->second.index, it->second.count} } );
0239
0240 auto descending_order = [](const sseq_unique& a, const sseq_unique& b) { return a.ic.count > b.ic.count ; } ;
0241
0242 std::sort( u.begin(), u.end(), descending_order );
0243 }
0244
0245 inline std::string sseq_index::desc(int min_count) const
0246 {
0247 std::stringstream ss ;
0248 int num = u.size();
0249 ss << "[sseq_index::desc num " << num << std::endl ;
0250 for(int i=0 ; i < num ; i++)
0251 {
0252 if( u[i].ic.count < min_count ) break ;
0253 ss << u[i].desc() << std::endl ;
0254 }
0255 ss << "]sseq_index::desc" << std::endl ;
0256 std::string str = ss.str() ;
0257 return str ;
0258 }
0259
0260
0261 struct sseq_index_ab_chi2
0262 {
0263 static constexpr const char* NAME = "sseq_index_ab_chi2.npy" ;
0264 static const int DEFAULT_ABSUM_MIN = 30 ;
0265 static constexpr const char* EKEY = "sseq_index_ab_chi2_ABSUM_MIN" ;
0266
0267 double sum ;
0268 double ndf ;
0269 double absum_min ;
0270 double spare ;
0271
0272 NP* serialize() const ;
0273 void save(const char* dir) const ;
0274
0275 void init();
0276 std::string desc() const ;
0277 std::string desc_rst() const ;
0278 std::string desc_html() const ;
0279 };
0280
0281
0282 inline NP* sseq_index_ab_chi2::serialize() const
0283 {
0284 NP* a = NP::Make<double>(4) ;
0285 double* aa = a->values<double>();
0286 aa[0] = sum ;
0287 aa[1] = ndf ;
0288 aa[2] = absum_min ;
0289 aa[3] = spare ;
0290 return a ;
0291 }
0292 inline void sseq_index_ab_chi2::save(const char* dir) const
0293 {
0294 NP* a = serialize();
0295 if(dir == nullptr) a->save(NAME);
0296 else a->save(dir, NAME);
0297 }
0298
0299
0300 inline void sseq_index_ab_chi2::init()
0301 {
0302 sum = 0 ;
0303 ndf = 0 ;
0304 absum_min = ssys::getenvint(EKEY,DEFAULT_ABSUM_MIN) ;
0305 spare = 0 ;
0306 }
0307
0308 std::string sseq_index_ab_chi2::desc() const
0309 {
0310 return desc_rst();
0311 }
0312 std::string sseq_index_ab_chi2::desc_rst() const
0313 {
0314 std::stringstream ss ;
0315 ss << "sseq_index_ab_chi2::desc"
0316 << " sum " << std::fixed << std::setw(10) << std::setprecision(4) << sum
0317 << " ndf " << std::setw(7) << ndf
0318 << " sum/ndf " << std::fixed << std::setw(10) << std::setprecision(4) << sum/ndf
0319 << " " << EKEY
0320 << ":"
0321 << absum_min
0322 ;
0323 std::string str = ss.str() ;
0324 return str ;
0325 }
0326
0327 std::string sseq_index_ab_chi2::desc_html() const
0328 {
0329 std::stringstream ss ;
0330 ss
0331 << "[CHI2"
0332 << " <span class=\"r\"> "
0333 << " sum " << std::fixed << std::setw(10) << std::setprecision(4) << sum
0334 << " ndf " << std::setw(7) << ndf
0335 << " sum/ndf " << std::fixed << std::setw(10) << std::setprecision(4) << sum/ndf
0336 << " </span> "
0337 << "]"
0338 ;
0339 std::string str = ss.str() ;
0340 return str ;
0341 }
0342
0343
0344
0345
0346
0347
0348
0349
0350
0351
0352
0353 struct sseq_index_ab
0354 {
0355 static constexpr const char* NAME = "sseq_index_ab.npy" ;
0356 static constexpr const char* _sseq_index_ab__desc_NUM_MAX = "sseq_index_ab__desc_NUM_MAX" ;
0357 static constexpr const char* _sseq_index_ab__desc_HISWID = "sseq_index_ab__desc_HISWID" ;
0358 static constexpr const char* _sseq_index_ab__desc_STYLE = "sseq_index_ab__desc_STYLE" ;
0359
0360 enum { STYLE_NONE, STYLE_RST, STYLE_HTML } ;
0361 static int STYLE();
0362 bool isRST() const { return style == STYLE_RST ; }
0363 bool isHTML() const { return style == STYLE_HTML ; }
0364
0365 const sseq_index& a ;
0366 const sseq_index& b ;
0367 int style ;
0368
0369 std::map<sseq, sseq_index_count_ab> m ;
0370 std::vector<sseq_qab> u ;
0371 sseq_index_ab_chi2 chi2 ;
0372
0373 sseq_index_ab( const sseq_index& a_, const sseq_index& b_ );
0374
0375 void init();
0376 void collect_seq();
0377 void order_seq();
0378 void calc_chi2();
0379 NP* serialize() const ;
0380 void save(const char* dir) const ;
0381
0382 std::string desc(const char* opt) const ;
0383 std::string desc() const ;
0384
0385 };
0386
0387 int sseq_index_ab::STYLE()
0388 {
0389 const char* _STYLE = ssys::getenvvar(_sseq_index_ab__desc_STYLE, "rst") ;
0390 int _style = STYLE_NONE ;
0391 if(0==strcmp(_STYLE, "rst" )) _style = STYLE_RST ;
0392 if(0==strcmp(_STYLE, "html" )) _style= STYLE_HTML ;
0393 assert( _style == STYLE_RST || _style == STYLE_HTML );
0394 return _style ;
0395 }
0396
0397
0398 inline sseq_index_ab::sseq_index_ab( const sseq_index& a_, const sseq_index& b_ )
0399 :
0400 a(a_),
0401 b(b_),
0402 style(STYLE())
0403 {
0404 init();
0405 }
0406
0407 inline void sseq_index_ab::init()
0408 {
0409 collect_seq();
0410 order_seq();
0411 calc_chi2();
0412 }
0413
0414
0415
0416
0417
0418
0419
0420
0421
0422
0423
0424
0425 inline void sseq_index_ab::collect_seq()
0426 {
0427
0428 for (int i = 0; i < int(a.u.size()); i++)
0429 {
0430 const sseq_unique& q_ic = a.u[i];
0431 const sseq& q = q_ic.q ;
0432 std::map<sseq, sseq_index_count_ab>::iterator it = m.find(q);
0433 bool first_q = it == m.end() ;
0434 if(first_q) m[q] = { q_ic.ic , {-1,-1} } ;
0435 }
0436
0437
0438
0439
0440 for (int i = 0; i < int(b.u.size()); i++)
0441 {
0442 const sseq_unique& q_ic = b.u[i];
0443 const sseq& q = q_ic.q ;
0444 std::map<sseq, sseq_index_count_ab>::iterator it = m.find(q);
0445
0446 bool first_q = it == m.end() ;
0447
0448 if(first_q) m[q] = { {-1,-1}, q_ic.ic } ;
0449 else it->second.b = q_ic.ic ;
0450
0451
0452 }
0453 }
0454
0455 inline void sseq_index_ab::order_seq()
0456 {
0457
0458 for(auto it=m.begin() ; it != m.end() ; it++)
0459 {
0460 const sseq& q = it->first ;
0461 const sseq_index_count_ab& ab = it->second ;
0462 u.push_back( {q, ab.a, ab.b } );
0463 }
0464
0465
0466 auto descending_order = [](const sseq_qab& x, const sseq_qab& y) { return x.maxcount() > y.maxcount() ; } ;
0467 std::sort( u.begin(), u.end(), descending_order );
0468 }
0469
0470
0471 inline void sseq_index_ab::calc_chi2()
0472 {
0473 int num = u.size();
0474 chi2.init();
0475 for(int i=0 ; i < num ; i++)
0476 {
0477 const sseq_qab& qab = u[i] ;
0478 bool included = false ;
0479 double c2 = qab.c2(included, chi2.absum_min) ;
0480 if(included)
0481 {
0482 chi2.sum += c2 ;
0483 chi2.ndf += 1 ;
0484 }
0485 }
0486 }
0487
0488
0489 inline NP* sseq_index_ab::serialize() const
0490 {
0491 int num = u.size();
0492 NP* a = NP::Make<int>(num, 4) ;
0493 int* aa = a->values<int>();
0494
0495 for(int i=0 ; i < num ; i++)
0496 {
0497 const sseq_qab& ui = u[i] ;
0498 const sseq& q = ui.q ;
0499
0500 std::string name = q.seqhis_();
0501 a->names.push_back(name);
0502
0503 const sseq_index_count& a = ui.a ;
0504 const sseq_index_count& b = ui.b ;
0505
0506 aa[i*4+0] = a.count ;
0507 aa[i*4+1] = b.count ;
0508 aa[i*4+2] = a.index ;
0509 aa[i*4+3] = b.index ;
0510 }
0511 return a ;
0512 }
0513
0514 inline void sseq_index_ab::save(const char* dir) const
0515 {
0516 NP* a = serialize();
0517 if(dir == nullptr) a->save(NAME);
0518 else a->save(dir, NAME);
0519 }
0520
0521
0522
0523
0524
0525
0526
0527
0528
0529
0530
0531
0532
0533 inline std::string sseq_index_ab::desc(const char* opt) const
0534 {
0535 enum { ALL, AZERO, BZERO, C2INC, C2EXC, DEVIANT, BRIEF } ;
0536 int mode = BRIEF ;
0537 if( strstr(opt,"ALL") ) mode = ALL ;
0538 else if( strstr(opt,"AZERO") ) mode = AZERO ;
0539 else if( strstr(opt,"BZERO") ) mode = BZERO ;
0540 else if( strstr(opt,"C2INC") ) mode = C2INC ;
0541 else if( strstr(opt,"C2EXC") ) mode = C2EXC ;
0542 else if( strstr(opt,"DEVIANT")) mode = DEVIANT ;
0543 else if( strstr(opt,"BRIEF")) mode = BRIEF ;
0544
0545 int u_size = u.size();
0546 int NUM_MAX = ssys::getenvint(_sseq_index_ab__desc_NUM_MAX, 40) ;
0547 int HISWID = ssys::getenvint( _sseq_index_ab__desc_HISWID, 0) ;
0548
0549
0550 int num = 0 ;
0551 if( mode == AZERO || mode == BZERO || mode == DEVIANT)
0552 {
0553
0554 num = u_size ;
0555 }
0556 else
0557 {
0558 num = NUM_MAX > 0 ? std::min( u_size, NUM_MAX ) : u_size ;
0559 }
0560
0561
0562 const char* itab = " " ;
0563
0564 std::stringstream ss ;
0565
0566 if(isRST())
0567 {
0568 ss
0569 << "[sseq_index_ab::desc"
0570 << " u_size " << u_size
0571 << " NUM_MAX " << NUM_MAX
0572 << " num " << num
0573 << " opt " << opt
0574 << " mode " << mode
0575 << ( mode == BRIEF ? chi2.desc_rst() : "" )
0576 << "\n"
0577 ;
0578 }
0579 else if(isHTML())
0580 {
0581 ss
0582 << ".. raw:: html\n"
0583 << "\n"
0584 << itab << "<pre class=\"mypretiny\">\n"
0585 << itab << "[sseq_index_ab::desc"
0586 << " " << ( mode == BRIEF ? "" : opt )
0587 << " " << ( mode == BRIEF ? chi2.desc_html() : "" )
0588 << "\n"
0589 ;
0590 }
0591
0592
0593 for(int i=0 ; i < num ; i++)
0594 {
0595 const sseq_qab& qab = u[i] ;
0596
0597 bool c2_inc(false);
0598 double c2 = qab.c2(c2_inc, chi2.absum_min);
0599
0600 bool a_zero = qab.a.count <= 0 && qab.b.count > 10 ;
0601 bool b_zero = qab.b.count <= 0 && qab.a.count > 10 ;
0602 bool deviant = c2 > 10. ;
0603
0604 bool selected = true ;
0605 switch(mode)
0606 {
0607 case ALL: selected = true ; break ;
0608 case BRIEF: selected = i < 60 ; break ;
0609 case AZERO: selected = a_zero ; break ;
0610 case BZERO: selected = b_zero ; break ;
0611 case C2INC: selected = c2_inc ; break ;
0612 case C2EXC: selected = !c2_inc ; break ;
0613 case DEVIANT: selected = deviant ; break ;
0614 }
0615
0616 const char* head ;
0617 const char* tail ;
0618 const char* pdev ;
0619
0620 if(isRST())
0621 {
0622 head = deviant ? ":r:`" : " " ;
0623 tail = deviant ? "`" : " " ;
0624 pdev = "DEVIANT " ;
0625 }
0626 else if(isHTML())
0627 {
0628 head = " " ;
0629 tail = " " ;
0630 pdev = "<span class=\"r\">DEVIANT</span> " ;
0631 }
0632
0633 std::string seqhis = qab.q.seqhis_();
0634 std::string u_seqhis = HISWID == 0 ? seqhis : seqhis.substr(0,HISWID) ;
0635
0636
0637 if(selected) ss
0638 << head
0639 << u_seqhis
0640 << " : "
0641 << std::setw(7) << qab.a.count
0642 << std::setw(7) << qab.b.count
0643 << " : "
0644 << std::fixed << std::setw(10) << std::setprecision(4) << c2
0645 << " : "
0646 << ( c2_inc ? "Y" : "N" )
0647 << " : "
0648 << std::setw(7) << qab.a.index
0649 << std::setw(7) << qab.b.index
0650 << " : "
0651 << ( a_zero ? "AZERO " : "" )
0652 << ( b_zero ? "BZERO " : "" )
0653 << ( deviant ? pdev : "" )
0654 << ( c2_inc ? " " : "C2EXC " )
0655 << tail
0656 << "\n"
0657 ;
0658 }
0659 ss
0660 << ( isHTML() ? itab : "" )
0661 << "]sseq_index_ab::desc\n"
0662 << ( isHTML() ? itab : "" )
0663 << ( isHTML() ? "</pre>\n" : "" )
0664 ;
0665
0666
0667 std::string str = ss.str();
0668 return str ;
0669 }
0670
0671
0672 inline std::string sseq_index_ab::desc() const
0673 {
0674 std::stringstream ss ;
0675 bool html = isHTML();
0676 ss
0677 << ( html ? "" : "AB" )
0678 << "\n"
0679 << desc("BRIEF")
0680 << "\n"
0681 << ( html ? "" : "AB" )
0682 << "\n"
0683 << desc("AZERO")
0684 << "\n"
0685 << ( html ? "" : "AB" )
0686 << "\n"
0687 << desc("BZERO")
0688 << "\n"
0689 << ( html ? "" : "AB" )
0690 << "\n"
0691 << desc("DEVIANT")
0692 << "\n"
0693 ;
0694
0695 std::string str = ss.str();
0696 return str ;
0697 }
0698
0699
0700