Warning, /acts/thirdparty/Annoy/0001-Modify-annoy.patch is written in an unsupported language. File is not indexed.
0001 From 16f4711d53903e2541dd518122d52e11e8569908 Mon Sep 17 00:00:00 2001
0002 From: Jeremy Couthures <jeremy.couthures@lapp.in2p3.fr>
0003 Date: Wed, 10 Jul 2024 10:31:31 +0200
0004 Subject: [PATCH] Add AngularEuclidean metric, get_seed function and
0005 optimizations
0006
0007 ---
0008 src/annoylib.h | 118 ++++++++++++++++++++++++++++++++++++++-----------
0009 1 file changed, 93 insertions(+), 25 deletions(-)
0010
0011 diff --git a/src/annoylib.h b/src/annoylib.h
0012 index 492d9ca..5c83601 100644
0013 --- a/src/annoylib.h
0014 +++ b/src/annoylib.h
0015 @@ -12,6 +12,10 @@
0016 // License for the specific language governing permissions and limitations under
0017 // the License.
0018
0019 +/*
0020 + * Note: This file has been modified to meet the needs of the ACTS projects and
0021 + * differs from the original provided by Spotify AB.
0022 + */
0023
0024 #ifndef ANNOY_ANNOYLIB_H
0025 #define ANNOY_ANNOYLIB_H
0026 @@ -128,7 +132,11 @@ inline void set_error_from_errno(char **error, const char* msg) {
0027 annoylib_showUpdate("%s: %s (%d)\n", msg, strerror(errno), errno);
0028 if (error) {
0029 *error = (char *)malloc(256); // TODO: win doesn't support snprintf
0030 - snprintf(*error, 255, "%s: %s (%d)", msg, strerror(errno), errno);
0031 +#if defined(_MSC_VER) || defined(__MINGW32__)
0032 + sprintf(*error, "%s: %s (%d)", msg, strerror(errno), errno);
0033 +#else
0034 + snprintf(*error, 256, "%s: %s (%d)", msg, strerror(errno), errno);
0035 +#endif
0036 }
0037 }
0038
0039 @@ -201,6 +209,35 @@ inline T euclidean_distance(const T* x, const T* y, int f) {
0040 return d;
0041 }
0042
0043 +template <typename T>
0044 +inline T angular_euclidean_distance(const T *x, const T *y, int f) {
0045 + /// First dimension is an angle, so we need to be careful about periodicity.
0046 + /// Here we assume that the angle is in radians, and that the difference is
0047 + /// less than 2pi. Other dimensions are regular Euclidean.
0048 +
0049 + /// Angular part
0050 + T tmp = std::fabs(*x - *y);
0051 +
0052 + /// If the difference is larger than pi, then we can get a smaller distance by
0053 + /// going the other way around the circle.
0054 + if (tmp > M_PI) {
0055 + tmp = 2 * M_PI - tmp;
0056 + }
0057 +
0058 + T d = tmp * tmp;
0059 + ++x;
0060 + ++y;
0061 +
0062 + /// Euclidean part
0063 + for (int i = 1; i < f; ++i) {
0064 + tmp = *x - *y;
0065 + d += tmp * tmp;
0066 + ++x;
0067 + ++y;
0068 + }
0069 + return d;
0070 +}
0071 +
0072 #ifdef ANNOYLIB_USE_AVX
0073 // Horizontal single sum of 256bit vector.
0074 inline float hsum256_ps_avx(__m256 v) {
0075 @@ -412,13 +449,13 @@ inline void two_means(const vector<Node*>& nodes, int f, Random& random, bool co
0076
0077 struct Base {
0078 template<typename T, typename S, typename Node>
0079 - static inline void preprocess(void* nodes, size_t _s, const S node_count, const int f) {
0080 + static inline void preprocess(void* /*unused*/, size_t /*unused*/, const S /*unused*/, const int /*unused*/) {
0081 // Override this in specific metric structs below if you need to do any pre-processing
0082 // on the entire set of nodes passed into this index.
0083 }
0084
0085 template<typename Node>
0086 - static inline void zero_value(Node* dest) {
0087 + static inline void zero_value(Node* /*unused*/) {
0088 // Initialize any fields that require sane defaults within this node.
0089 }
0090
0091 @@ -547,7 +584,7 @@ struct DotProduct : Angular {
0092 }
0093
0094 template<typename S, typename T>
0095 - static inline void init_node(Node<S, T>* n, int f) {
0096 + static inline void init_node(Node<S, T>* /*unused*/, int /*unused*/) {
0097 }
0098
0099 template<typename T, typename Node>
0100 @@ -671,17 +708,17 @@ struct Hamming : Base {
0101 return dist;
0102 }
0103 template<typename S, typename T>
0104 - static inline bool margin(const Node<S, T>* n, const T* y, int f) {
0105 + static inline bool margin(const Node<S, T>* n, const T* y, int /*unused*/) {
0106 static const size_t n_bits = sizeof(T) * 8;
0107 T chunk = n->v[0] / n_bits;
0108 return (y[chunk] & (static_cast<T>(1) << (n_bits - 1 - (n->v[0] % n_bits)))) != 0;
0109 }
0110 template<typename S, typename T, typename Random>
0111 - static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
0112 + static inline bool side(const Node<S, T>* n, const T* y, int f, Random& /*unused*/) {
0113 return margin(n, y, f);
0114 }
0115 template<typename S, typename T, typename Random>
0116 - static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
0117 + static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t /*unused*/, Random& random, Node<S, T>* n) {
0118 size_t cur_size = 0;
0119 size_t i = 0;
0120 int dim = f * 8 * sizeof(T);
0121 @@ -720,7 +757,7 @@ struct Hamming : Base {
0122 return distance;
0123 }
0124 template<typename S, typename T>
0125 - static inline void init_node(Node<S, T>* n, int f) {
0126 + static inline void init_node(Node<S, T>* /*unused*/, int /*unused*/) {
0127 }
0128 static const char* name() {
0129 return "hamming";
0130 @@ -784,7 +821,7 @@ struct Euclidean : Minkowski {
0131 return sqrt(std::max(distance, T(0)));
0132 }
0133 template<typename S, typename T>
0134 - static inline void init_node(Node<S, T>* n, int f) {
0135 + static inline void init_node(Node<S, T>* /*unused*/, int /*unused*/) {
0136 }
0137 static const char* name() {
0138 return "euclidean";
0139 @@ -792,6 +829,35 @@ struct Euclidean : Minkowski {
0140
0141 };
0142
0143 +struct AngularEuclidean : Minkowski {
0144 + template <typename S, typename T>
0145 + static inline T distance(const Node<S, T> *x, const Node<S, T> *y, int f) {
0146 + return angular_euclidean_distance(x->v, y->v, f);
0147 + }
0148 + template <typename S, typename T, typename Random>
0149 + static inline void create_split(const vector<Node<S, T> *> &nodes, int f,
0150 + size_t s, Random &random,
0151 + Node<S, T> *n) {
0152 + Node<S, T> *p = (Node<S, T> *)alloca(s);
0153 + Node<S, T> *q = (Node<S, T> *)alloca(s);
0154 + two_means<T, Random, AngularEuclidean, Node<S, T>>(nodes, f, random, false,
0155 + p, q);
0156 +
0157 + for (int z = 0; z < f; z++)
0158 + n->v[z] = p->v[z] - q->v[z];
0159 + Base::normalize<T, Node<S, T>>(n, f);
0160 + n->a = 0.0;
0161 + for (int z = 0; z < f; z++)
0162 + n->a += -n->v[z] * (p->v[z] + q->v[z]) / 2;
0163 + }
0164 + template <typename T> static inline T normalized_distance(T distance) {
0165 + return sqrt(std::max(distance, T(0)));
0166 + }
0167 + template <typename S, typename T>
0168 + static inline void init_node(Node<S, T> * /*unused*/, int /*unused*/) {}
0169 + static const char *name() { return "angular_euclidean"; }
0170 +};
0171 +
0172 struct Manhattan : Minkowski {
0173 template<typename S, typename T>
0174 static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
0175 @@ -815,7 +881,7 @@ struct Manhattan : Minkowski {
0176 return std::max(distance, T(0));
0177 }
0178 template<typename S, typename T>
0179 - static inline void init_node(Node<S, T>* n, int f) {
0180 + static inline void init_node(Node<S, T>* /*unused*/, int /*unused*/) {
0181 }
0182 static const char* name() {
0183 return "manhattan";
0184 @@ -841,6 +907,7 @@ class AnnoyIndexInterface {
0185 virtual void verbose(bool v) = 0;
0186 virtual void get_item(S item, T* v) const = 0;
0187 virtual void set_seed(R q) = 0;
0188 + virtual R get_seed() const = 0;
0189 virtual bool on_disk_build(const char* filename, char** error=NULL) = 0;
0190 };
0191
0192 @@ -869,14 +936,14 @@ public:
0193 #endif
0194
0195 protected:
0196 - const int _f;
0197 - size_t _s;
0198 + const unsigned int _f;
0199 + size_t _s; // Size of each node
0200 S _n_items;
0201 void* _nodes; // Could either be mmapped, or point to a memory buffer that we reallocate
0202 S _n_nodes;
0203 S _nodes_size;
0204 vector<S> _roots;
0205 - S _K;
0206 + S _K; // Max number of descendants to fit into node
0207 R _seed;
0208 bool _loaded;
0209 bool _verbose;
0210 @@ -885,7 +952,7 @@ protected:
0211 bool _built;
0212 public:
0213
0214 - AnnoyIndex(int f) : _f(f), _seed(Random::default_seed) {
0215 + AnnoyIndex(unsigned int f) : _f(f), _seed(Random::default_seed) {
0216 _s = offsetof(Node, v) + _f * sizeof(T); // Size of each node
0217 _verbose = false;
0218 _built = false;
0219 @@ -896,7 +963,7 @@ public:
0220 unload();
0221 }
0222
0223 - int get_f() const {
0224 + unsigned int get_f() const {
0225 return _f;
0226 }
0227
0228 @@ -911,6 +978,9 @@ public:
0229 return false;
0230 }
0231 _allocate_size(item + 1);
0232 + if (item >= _n_items)
0233 + _n_items = item + 1;
0234 +
0235 Node* n = _get(item);
0236
0237 D::zero_value(n);
0238 @@ -919,14 +989,11 @@ public:
0239 n->children[1] = 0;
0240 n->n_descendants = 1;
0241
0242 - for (int z = 0; z < _f; z++)
0243 + for (unsigned int z = 0; z < _f; z++)
0244 n->v[z] = w[z];
0245
0246 D::init_node(n, _f);
0247
0248 - if (item >= _n_items)
0249 - _n_items = item + 1;
0250 -
0251 return true;
0252 }
0253
0254 @@ -976,8 +1043,7 @@ public:
0255 // This way we can load them faster without reading the whole file
0256 _allocate_size(_n_nodes + (S)_roots.size());
0257 for (size_t i = 0; i < _roots.size(); i++)
0258 - memcpy(_get(_n_nodes + (S)i), _get(_roots[i]), _s);
0259 - _n_nodes += _roots.size();
0260 + memcpy(_get(_n_nodes++), _get(_roots[i]), _s);
0261
0262 if (_verbose) annoylib_showUpdate("has %d nodes\n", _n_nodes);
0263
0264 @@ -1119,10 +1185,10 @@ public:
0265
0266 // Find the roots by scanning the end of the file and taking the nodes with most descendants
0267 _roots.clear();
0268 - S m = -1;
0269 + S m = (S)-1;
0270 for (S i = _n_nodes - 1; i >= 0; i--) {
0271 S k = _get(i)->n_descendants;
0272 - if (m == -1 || k == m) {
0273 + if (m == (S)-1 || k == m) {
0274 _roots.push_back(i);
0275 m = k;
0276 } else {
0277 @@ -1175,6 +1241,8 @@ public:
0278 _seed = seed;
0279 }
0280
0281 + R get_seed() const { return _seed; }
0282 +
0283 void thread_build(int q, int thread_idx, ThreadedBuildPolicy& threaded_build_policy) {
0284 // Each thread needs its own seed, otherwise each thread would be building the same tree(s)
0285 Random _random(_seed + thread_idx);
0286 @@ -1332,7 +1400,7 @@ protected:
0287 children_indices[1].clear();
0288
0289 // Set the vector to 0.0
0290 - for (int z = 0; z < _f; z++)
0291 + for (unsigned int z = 0; z < _f; z++)
0292 m->v[z] = 0;
0293
0294 for (size_t i = 0; i < indices.size(); i++) {
0295 @@ -1425,7 +1493,7 @@ protected:
0296 class AnnoyIndexSingleThreadedBuildPolicy {
0297 public:
0298 template<typename S, typename T, typename D, typename Random>
0299 - static void build(AnnoyIndex<S, T, D, Random, AnnoyIndexSingleThreadedBuildPolicy>* annoy, int q, int n_threads) {
0300 + static void build(AnnoyIndex<S, T, D, Random, AnnoyIndexSingleThreadedBuildPolicy>* annoy, int q, int /*unused*/) {
0301 AnnoyIndexSingleThreadedBuildPolicy threaded_build_policy;
0302 annoy->thread_build(q, 0, threaded_build_policy);
0303 }
0304 --
0305 2.34.1
0306