File indexing completed on 2026-04-09 07:49:37
0001 #pragma once
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013 #include <cassert>
0014 #include <vector>
0015 #include <string>
0016 #include <cstring>
0017 #include <algorithm>
0018 #include <sstream>
0019 #include <iostream>
0020 #include <iomanip>
0021 #include "NPFold.h"
0022
/// Unary predicate for std::find_if over (key, count) pairs: true when the
/// pair's key matches the C string captured at construction.
struct sfreq_matchkey
{
    const char* query ;   // borrowed pointer, not copied : it must outlive the predicate

    sfreq_matchkey(const char* query_)
        : query(query_)
    {
    }

    /// Comparison is done with strcmp, so only the characters before the
    /// first NUL of each string take part.
    bool operator()(const std::pair<std::string, int>& p) const
    {
        const char* candidate = p.first.c_str() ;
        return !std::strcmp(query, candidate) ;
    }
};
0033
0034 struct sfreq
0035 {
0036 typedef std::pair<std::string,int> SU ;
0037 typedef typename std::vector<SU> VSU ;
0038 typedef typename VSU::const_iterator IT ;
0039
0040 VSU vsu ;
0041
0042 unsigned get_num() const ;
0043 void get_keys(std::vector<std::string>& keys, int freq_cut) const ;
0044
0045 const char* get_key(unsigned idx) const ;
0046
0047 int get_freq(unsigned idx) const ;
0048 int get_freq(const char* key) const ;
0049
0050 int find_index(const char* key) const ;
0051 void add(const char* key );
0052
0053 bool is_disqualify( const char* key) const ;
0054 void set_disqualify(const char* key) ;
0055 void set_disqualify(const std::vector<std::string>& disqualify) ;
0056
0057
0058 static bool ascending_freq( const SU& a, const SU& b) ;
0059 static bool descending_freq(const SU& a, const SU& b) ;
0060 void sort(bool descending=true);
0061
0062 std::string desc(const char* sub) const ;
0063 std::string desc(unsigned idx) const ;
0064 std::string desc() const ;
0065
0066 static constexpr const char* KEY = "key.npy" ;
0067 static constexpr const char* VAL = "val.npy" ;
0068 size_t get_maxkeylen() const ;
0069 int get_total() const ;
0070
0071 NP* make_key() const ;
0072 NP* make_val() const ;
0073 void import_key_val( const NP* key, const NP* val);
0074
0075 void save(const char* dir) const ;
0076 void save(const char* dir, const char* reldir) const ;
0077
0078 void load(const char* dir);
0079 void load(const char* dir, const char* reldir);
0080
0081 NPFold* serialize() const ;
0082 void import(const NPFold* fold);
0083
0084 };
0085
0086
0087 inline unsigned sfreq::get_num() const
0088 {
0089 return vsu.size();
0090 }
0091
0092 inline void sfreq::get_keys(std::vector<std::string>& keys, int freq_cut) const
0093 {
0094 for(unsigned i=0 ; i < vsu.size() ; i++)
0095 {
0096 const char* key = get_key(i);
0097 int freq = get_freq(i);
0098 if(freq > freq_cut) keys.push_back(key) ;
0099 }
0100 }
0101
0102
0103 inline const char* sfreq::get_key(unsigned idx) const
0104 {
0105 assert( idx < vsu.size() );
0106 return vsu[idx].first.c_str() ;
0107 }
0108 inline int sfreq::get_freq(unsigned idx) const
0109 {
0110 assert( idx < vsu.size() );
0111 return vsu[idx].second ;
0112 }
0113
0114 inline int sfreq::get_freq(const char* key) const
0115 {
0116 int idx = find_index(key);
0117 return idx == -1 ? -1 : int(vsu[idx].second) ;
0118 }
0119
0120 inline int sfreq::find_index(const char* key) const
0121 {
0122 sfreq_matchkey mk(key);
0123 IT it = std::find_if( vsu.begin(), vsu.end(), mk );
0124 return it == vsu.end() ? -1 : std::distance( vsu.begin(), it );
0125 }
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136 inline void sfreq::add(const char* key)
0137 {
0138 int idx = find_index(key);
0139 if( idx == -1 ) vsu.push_back(SU(key, 1u)) ;
0140 else vsu[idx].second += 1 ;
0141 }
0142
0143
0144 inline bool sfreq::is_disqualify(const char* key) const
0145 {
0146 int freq = get_freq(key);
0147 return freq < 0 ;
0148 }
0149
0150 inline void sfreq::set_disqualify(const char* key)
0151 {
0152 if(is_disqualify(key)) return ;
0153 int idx = find_index(key);
0154 assert( idx > -1 && vsu[idx].second > 0 );
0155 vsu[idx].second = -vsu[idx].second ;
0156 }
0157
0158 inline void sfreq::set_disqualify(const std::vector<std::string>& disqualify)
0159 {
0160 for(unsigned i=0 ; i < disqualify.size() ; i++)
0161 {
0162 const char* sub = disqualify[i].c_str();
0163 set_disqualify(sub);
0164 }
0165 }
0166
0167
0168
0169
0170
0171 inline bool sfreq::ascending_freq(const SU& a, const SU& b)
0172 {
0173 return b.second > a.second ;
0174 }
0175 inline bool sfreq::descending_freq(const SU& a, const SU& b)
0176 {
0177 return a.second > b.second ;
0178 }
0179 inline void sfreq::sort(bool descending)
0180 {
0181 std::sort(vsu.begin(), vsu.end(), descending ? descending_freq : ascending_freq );
0182 }
0183
0184 inline std::string sfreq::desc(const char* sub) const
0185 {
0186 int idx = find_index(sub);
0187 return idx > -1 ? desc(unsigned(idx)) : "-" ;
0188 }
0189
0190 inline std::string sfreq::desc(unsigned idx) const
0191 {
0192 std::stringstream ss ;
0193 const SU& su = vsu[idx] ;
0194 const std::string& k = su.first ;
0195 int v = su.second ;
0196 ss << std::setw(5) << idx
0197 << " : "
0198 << std::setw(32) << k.c_str()
0199 << " : "
0200 << std::setw(5) << v
0201 ;
0202 std::string s = ss.str();
0203 return s ;
0204 }
0205
0206 inline std::string sfreq::desc() const
0207 {
0208 int total = get_total();
0209 std::stringstream ss ;
0210 for(unsigned idx=0 ; idx < vsu.size() ; idx++) ss << desc(idx) << std::endl ;
0211 ss << std::setw(5) << "" << " : " << std::setw(32) << "" << " : " << std::setw(5) << total << std::endl ;
0212 std::string s = ss.str();
0213 return s ;
0214 }
0215
0216 inline size_t sfreq::get_maxkeylen() const
0217 {
0218 size_t mx = 0 ;
0219 for(unsigned i=0 ; i < vsu.size() ; i++) mx = std::max(mx, strlen(vsu[i].first.c_str())) ;
0220 return mx ;
0221 }
0222
0223 inline int sfreq::get_total() const
0224 {
0225 int total = 0 ;
0226 for(unsigned i=0 ; i < vsu.size() ; i++) total += std::abs(vsu[i].second) ;
0227 return total ;
0228 }
0229
0230
0231
0232
0233
0234
0235
0236
0237
0238
0239
0240
0241
0242
0243
0244
0245
0246
0247
0248
0249 inline NP* sfreq::make_key() const
0250 {
0251 if(vsu.size()==0) return nullptr ;
0252 size_t mkl = get_maxkeylen() ;
0253 NP* key = NP::Make<char>( vsu.size(), mkl ) ;
0254 char* kdat = key->values<char>();
0255
0256 for(unsigned i=0 ; i < vsu.size() ; i++)
0257 {
0258 const std::pair<std::string, int> su = vsu[i] ;
0259 const char* k = su.first.c_str() ;
0260 for(unsigned j=0 ; j < strlen(k) ; j++) kdat[i*mkl+j] = k[j] ;
0261 }
0262 return key ;
0263 }
0264
0265 inline NP* sfreq::make_val() const
0266 {
0267 if(vsu.size()==0) return nullptr ;
0268 NP* val = NP::Make<int>( vsu.size() ) ;
0269 int* vdat = val->values<int>();
0270
0271 for(unsigned i=0 ; i < vsu.size() ; i++)
0272 {
0273 const std::pair<std::string, int> su = vsu[i] ;
0274 vdat[i] = su.second ;
0275 }
0276 return val ;
0277 }
0278
0279 inline void sfreq::import_key_val( const NP* key, const NP* val)
0280 {
0281 if(key == nullptr || val == nullptr) return ;
0282 assert( key->shape.size() );
0283
0284 unsigned keylen = key->shape[1] ;
0285 const char* kdat = key->cvalues<char>();
0286 const int* vdat = val->cvalues<int>();
0287
0288 assert( key->shape[0] == val->shape[0]) ;
0289 unsigned num_kv = key->shape[0] ;
0290
0291 for(unsigned i=0 ; i < num_kv ; i++)
0292 {
0293 const char* kptr = kdat+i*keylen ;
0294 std::string k(kptr, kptr+keylen) ;
0295 int v = vdat[i] ;
0296 vsu.push_back(std::pair<std::string, int>(k,v) );
0297 }
0298 }
0299
0300
0301
0302 inline void sfreq::save(const char* dir) const
0303 {
0304 if(vsu.size() == 0) return ;
0305
0306 NPFold* fold = serialize() ;
0307 fold->save(dir);
0308 }
0309
0310 inline void sfreq::save(const char* dir, const char* reldir) const
0311 {
0312 if(vsu.size() == 0) return ;
0313 NPFold* fold = serialize() ;
0314 fold->save(dir, reldir);
0315 }
0316
0317 inline void sfreq::load(const char* dir)
0318 {
0319 NPFold* fold = NPFold::Load(dir) ;
0320 import(fold);
0321 }
0322 inline void sfreq::load(const char* dir, const char* reldir)
0323 {
0324 NPFold* fold = NPFold::Load(dir, reldir);
0325 import(fold);
0326 }
0327
0328
0329 inline NPFold* sfreq::serialize() const
0330 {
0331 NPFold* fold = new NPFold ;
0332 fold->add( KEY, make_key() );
0333 fold->add( VAL, make_val() );
0334 return fold ;
0335 }
0336
0337 inline void sfreq::import(const NPFold* fold)
0338 {
0339 const NP* key = fold->get(KEY) ;
0340 const NP* val = fold->get(VAL);
0341 import_key_val(key, val);
0342 }
0343
0344
0345