Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:49:37

0001 #pragma once
0002 /**
0003 sfreq.h : counts occurrence frequencies of strings and sorts them by frequency
0004 ============================================================================
0005 
0006 Canonical usage is for geometry progeny digests 
0007 
0008 * subs are collected by stree::classifySubtrees
0009 
0010 
0011 **/
0012 
0013 #include <cassert>
0014 #include <vector>
0015 #include <string>
0016 #include <cstring>
0017 #include <algorithm>
0018 #include <sstream>
0019 #include <iostream>
0020 #include <iomanip>
0021 #include "NPFold.h"
0022 
/**
sfreq_matchkey
---------------

Unary predicate for std::find_if over (string,int) pairs : true when the
C-string form of the pair key equals the query. The comparison is done with
strcmp on c_str(), so only the characters before the first NUL of the stored
key take part in the match.
**/
struct sfreq_matchkey
{
    const char* query ;

    sfreq_matchkey(const char* query_)
        :
        query(query_)
    {
    }

    bool operator()(const std::pair<std::string, int>& p) const
    {
        const char* candidate = p.first.c_str() ;
        return 0 == strcmp(query, candidate) ;
    }
};
0033 
0034 struct sfreq
0035 {
0036     typedef std::pair<std::string,int> SU ;   
0037     typedef typename std::vector<SU>        VSU  ; 
0038     typedef typename VSU::const_iterator    IT ;   
0039 
0040     VSU vsu ; 
0041 
0042     unsigned get_num() const ; 
0043     void get_keys(std::vector<std::string>& keys, int freq_cut) const ; 
0044 
0045     const char* get_key(unsigned idx) const ; 
0046 
0047     int get_freq(unsigned idx) const ; 
0048     int get_freq(const char* key) const ; 
0049 
0050     int find_index(const char* key) const ; 
0051     void add(const char* key ); 
0052 
0053     bool is_disqualify( const char* key) const ; 
0054     void set_disqualify(const char* key) ; 
0055     void set_disqualify(const std::vector<std::string>& disqualify) ; 
0056 
0057 
0058     static bool ascending_freq( const SU& a, const SU& b) ; 
0059     static bool descending_freq(const SU& a, const SU& b) ; 
0060     void sort(bool descending=true);  
0061 
0062     std::string desc(const char* sub) const ; 
0063     std::string desc(unsigned idx) const ; 
0064     std::string desc() const ; 
0065 
0066     static constexpr const char* KEY = "key.npy" ; 
0067     static constexpr const char* VAL = "val.npy" ; 
0068     size_t get_maxkeylen() const ; 
0069     int get_total() const ; 
0070 
0071     NP* make_key() const ; 
0072     NP* make_val() const ; 
0073     void import_key_val( const NP* key, const NP* val); 
0074 
0075     void save(const char* dir) const ; 
0076     void save(const char* dir, const char* reldir) const ; 
0077 
0078     void load(const char* dir); 
0079     void load(const char* dir, const char* reldir); 
0080 
0081     NPFold* serialize() const ; 
0082     void import(const NPFold* fold); 
0083 
0084 };
0085 
0086 
0087 inline unsigned sfreq::get_num() const
0088 {
0089     return vsu.size();  
0090 } 
0091 
0092 inline void sfreq::get_keys(std::vector<std::string>& keys, int freq_cut) const
0093 {
0094     for(unsigned i=0 ; i < vsu.size() ; i++)
0095     {
0096         const char* key = get_key(i); 
0097         int freq  = get_freq(i); 
0098         if(freq > freq_cut) keys.push_back(key) ;  
0099     }
0100 }
0101 
0102 
0103 inline const char* sfreq::get_key(unsigned idx) const
0104 {
0105     assert( idx < vsu.size() ); 
0106     return vsu[idx].first.c_str() ; 
0107 }
0108 inline int sfreq::get_freq(unsigned idx) const
0109 {
0110     assert( idx < vsu.size() ); 
0111     return vsu[idx].second ; 
0112 }
0113 
0114 inline int sfreq::get_freq(const char* key) const
0115 {
0116     int idx = find_index(key); 
0117     return idx == -1 ? -1 : int(vsu[idx].second) ; 
0118 }
0119 
0120 inline int sfreq::find_index(const char* key) const 
0121 {
0122     sfreq_matchkey mk(key);  
0123     IT it = std::find_if( vsu.begin(), vsu.end(), mk ); 
0124     return it == vsu.end() ? -1 : std::distance( vsu.begin(), it ); 
0125 }
0126 
0127 /**
0128 sfreq::add : maybe "add_unique_count" would be a better name 
0129 ---------------------------------------------------------------
0130 
0131 Adding sub keys and counts to a vector of pairs::
0132 
0133    (subtree_digest, freq)
0134 
0135 **/
0136 inline void sfreq::add(const char* key)
0137 {
0138     int idx = find_index(key); 
0139     if( idx == -1 ) vsu.push_back(SU(key, 1u)) ; 
0140     else vsu[idx].second += 1 ;  
0141 }
0142 
0143 
0144 inline bool sfreq::is_disqualify(const char* key) const 
0145 {
0146    int freq = get_freq(key); 
0147    return freq < 0 ;    
0148 }
0149 
0150 inline void sfreq::set_disqualify(const char* key)
0151 {
0152     if(is_disqualify(key)) return ; 
0153     int idx = find_index(key); 
0154     assert( idx > -1 && vsu[idx].second > 0 ); 
0155     vsu[idx].second = -vsu[idx].second ; 
0156 }
0157 
0158 inline void sfreq::set_disqualify(const std::vector<std::string>& disqualify)
0159 {
0160     for(unsigned i=0 ; i < disqualify.size() ; i++)
0161     {
0162         const char* sub = disqualify[i].c_str(); 
0163         set_disqualify(sub); 
0164     }
0165 }
0166 
0167 
0168 
0169 
0170 
0171 inline bool sfreq::ascending_freq(const SU& a, const SU& b)  // static
0172 {
0173     return b.second > a.second ;
0174 }
0175 inline bool sfreq::descending_freq(const SU& a, const SU& b) // static 
0176 {
0177     return a.second > b.second ;
0178 }
0179 inline void sfreq::sort(bool descending) 
0180 {
0181     std::sort(vsu.begin(), vsu.end(), descending ? descending_freq : ascending_freq );
0182 }
0183 
0184 inline std::string sfreq::desc(const char* sub) const 
0185 {
0186     int idx = find_index(sub); 
0187     return idx > -1 ? desc(unsigned(idx)) : "-" ; 
0188 }
0189 
0190 inline std::string sfreq::desc(unsigned idx) const 
0191 {
0192     std::stringstream ss ; 
0193     const SU& su = vsu[idx] ;  
0194     const std::string& k = su.first ; 
0195     int v = su.second ; 
0196     ss << std::setw(5) << idx 
0197        << " : " 
0198        << std::setw(32) << k.c_str()
0199        << " : " 
0200        << std::setw(5) << v
0201        ;  
0202     std::string s = ss.str(); 
0203     return s ; 
0204 }
0205 
0206 inline std::string sfreq::desc() const 
0207 {
0208     int total = get_total(); 
0209     std::stringstream ss ; 
0210     for(unsigned idx=0 ; idx < vsu.size() ; idx++) ss << desc(idx) << std::endl ; 
0211     ss << std::setw(5) << "" << " : " << std::setw(32) << "" << " : " << std::setw(5) << total << std::endl ;  
0212     std::string s = ss.str(); 
0213     return s ; 
0214 }
0215 
0216 inline size_t sfreq::get_maxkeylen() const 
0217 {
0218     size_t mx = 0 ; 
0219     for(unsigned i=0 ; i < vsu.size() ; i++) mx = std::max(mx, strlen(vsu[i].first.c_str())) ; 
0220     return mx ;  
0221 }
0222 
0223 inline int sfreq::get_total() const
0224 {
0225     int total = 0 ; 
0226     for(unsigned i=0 ; i < vsu.size() ; i++) total += std::abs(vsu[i].second) ; 
0227     return total ; 
0228 }
0229 
0230 
0231 
0232 
0233 
0234 /**
0235 sfreq::make_key
0236 -----------------
0237 
0238 HMM: this uses an awkward approach of using a char array.
0239 Alternate would be to just use set_names on the val array.  
0240 
0241 In [5]: t.key.view("|S5").ravel()
0242 Out[5]: array([b'blue', b'red', b'green'], dtype='|S5')
0243 
0244 In [6]: t.key.shape
0245 Out[6]: (3, 5)
0246 **/
0247 
0248 
0249 inline NP* sfreq::make_key() const 
0250 {
0251     if(vsu.size()==0) return nullptr ; 
0252     size_t mkl = get_maxkeylen() ; 
0253     NP* key = NP::Make<char>( vsu.size(), mkl ) ;
0254     char* kdat = key->values<char>(); 
0255 
0256     for(unsigned i=0 ; i < vsu.size() ; i++)
0257     {
0258         const std::pair<std::string, int> su = vsu[i] ;  
0259         const char* k = su.first.c_str() ; 
0260         for(unsigned j=0 ; j < strlen(k) ; j++) kdat[i*mkl+j] = k[j] ; 
0261     }
0262     return key ; 
0263 }
0264 
0265 inline NP* sfreq::make_val() const 
0266 {
0267     if(vsu.size()==0) return nullptr ; 
0268     NP* val = NP::Make<int>( vsu.size() ) ; 
0269     int* vdat = val->values<int>(); 
0270 
0271     for(unsigned i=0 ; i < vsu.size() ; i++)
0272     {
0273         const std::pair<std::string, int> su = vsu[i] ;  
0274         vdat[i] = su.second ; 
0275     }
0276     return val ;
0277 } 
0278 
0279 inline void sfreq::import_key_val( const NP* key, const NP* val)
0280 {
0281     if(key == nullptr || val == nullptr) return ; 
0282     assert( key->shape.size() ); 
0283 
0284     unsigned keylen = key->shape[1] ; 
0285     const char* kdat = key->cvalues<char>(); 
0286     const int* vdat = val->cvalues<int>(); 
0287 
0288     assert( key->shape[0] == val->shape[0]) ; 
0289     unsigned num_kv = key->shape[0] ; 
0290  
0291     for(unsigned i=0 ; i < num_kv ; i++)
0292     {
0293         const char* kptr = kdat+i*keylen ; 
0294         std::string k(kptr, kptr+keylen) ; 
0295         int v = vdat[i] ; 
0296         vsu.push_back(std::pair<std::string, int>(k,v) );  
0297     }
0298 }
0299 
0300 
0301 
0302 inline void sfreq::save(const char* dir) const 
0303 {
0304     if(vsu.size() == 0) return ; 
0305 
0306     NPFold* fold = serialize() ; 
0307     fold->save(dir); 
0308 }
0309 
0310 inline void sfreq::save(const char* dir, const char* reldir) const 
0311 {
0312     if(vsu.size() == 0) return ; 
0313     NPFold* fold = serialize() ; 
0314     fold->save(dir, reldir); 
0315 }
0316 
0317 inline void sfreq::load(const char* dir)
0318 {
0319     NPFold* fold = NPFold::Load(dir) ; 
0320     import(fold); 
0321 }
0322 inline void sfreq::load(const char* dir, const char* reldir)
0323 {
0324     NPFold* fold = NPFold::Load(dir, reldir); 
0325     import(fold); 
0326 }
0327 
0328 
0329 inline NPFold* sfreq::serialize() const 
0330 {
0331     NPFold* fold = new NPFold ; 
0332     fold->add( KEY, make_key() ); 
0333     fold->add( VAL, make_val() ); 
0334     return fold ; 
0335 }
0336 
0337 inline void sfreq::import(const NPFold* fold)
0338 {
0339     const NP* key = fold->get(KEY) ; 
0340     const NP* val = fold->get(VAL);  
0341     import_key_val(key, val); 
0342 }
0343 
0344 
0345