Bloom Filter Trie
Node.h
1 #pragma once
2 
3 /* ===================================================================================================================================
4 * INCLUDES AND DEFINES
5 * ===================================================================================================================================
6 */
7 
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <stdint.h>
11 #include <inttypes.h>
12 #include <math.h>
13 #include <limits.h>
14 #include <string.h>
15 
16 #include <Judy.h>
17 
18 #include <omp.h>
19 
20 #include "default_param.h"
21 #include "useful_macros.h"
22 #include "UC.h"
23 #include "xxhash.h"
24 
25 /* ===================================================================================================================================
26 * STRUCTURES DECLARATION
27 * ===================================================================================================================================
28 */
29 
30 // info_per_level groups the integer parameters attached to one level of the tree.
// NOTE(review): the previous description said this structure "contains pointers on macro used to manipulate the
// field children_type of a CC", but every member declared below is a plain int -- confirm and drop if obsolete.
31 // The structure is used in an array, initialized in create_info_per_level()
32 typedef struct{
33  int nb_ucs_skp;
34  int nb_kmers_uc; //Number of prefixes a CC, at a specific level of the tree, can contain.
35  int mask_shift_kmer; //Suffixes are encoded in arrays of 8-bit cells: mask_shift_kmer covers only the bits used in the last cell
36  int size_kmer_in_bytes; //Size, in bytes, of the suffixes represented at a given level of the tree
37  int size_kmer_in_bytes_minus_1; //Size, in bytes, of the suffixes represented at a given level of the tree, minus the size of the prefixes ps
38  int nb_bits_per_cell_skip_filter2; //presumably the bit-size counterpart of nb_bytes_per_cell_skip_filter2 -- TODO confirm
39  int nb_bits_per_cell_skip_filter3; //presumably the bit-size counterpart of nb_bytes_per_cell_skip_filter3 -- TODO confirm
40  int nb_bytes_per_cell_skip_filter2;
41  int nb_bytes_per_cell_skip_filter3;
42  int modulo_hash;
43  int tresh_suf_pref;
44  int level_min;
45  int root;
46 } info_per_level;
47 
48 // A node is a list of containers of two types: Compressed Containers (CC) and Uncompressed Containers (UC).
49 // It can contain zero or more CCs in CC_array, plus always exactly one UC, which may be empty (UC_array.substrings == NULL) or not.
// The structure is declared packed, so the compiler inserts no padding between CC_array and UC_array.
50 typedef struct {
51  void* CC_array; // Array of CCs; createNode() and initiateNode() set it to NULL
52  UC UC_array; // The single UC of the node, embedded by value (type UC comes from UC.h)
53 } __attribute__ ((__packed__)) Node;
54 
55 // resultPresence is a structure produced by presenceKmer(). It contains information about the presence of a prefix p in a given node.
// create_resultPresence() allocates one; initialize_resultPresence() resets one for reuse.
56 typedef struct{
57  void* node;
58  void* container; // Pointer to the container (UC or CC) which contains the prefix p, or cc->children that contains the substring we are looking for
59  void* link_child; // Pointer to the container (Node or uint8_t*) potentially holding the suffix linked to the prefix p
60 
61  int level_node;
62  int pos_container;
63 
64  int bucket; // Position, in children, of the array containing the suffixes linked to prefix p
65  int pos_sub_bucket; // Position (in terms of suffix+annotation) of the first suffix linked to prefix p in children[bucket]
66 
67  int children_type_leaf; // Boolean indicating that container is a leaf of the tree
68  int container_is_UC; // Boolean indicating whether container is a UC
69 
70  int posFilter2; // Position of p in filter2 (where it is or where it should be) or size of suffix; INT_MAX after initialization
71  int posFilter3; // Position of p in filter3 (where it is or where it should be) or position of match in link_child; INT_MAX after initialization
72  int pos_extra_filter3; // Position of p in extra_filter3 (where it is or where it should be) or size of suffix; INT_MAX after initialization
73 
74  int pos_children;
75  int count_children;
76  int count_nodes;
77 
78  uint8_t substring[SIZE_BYTES_SUF_PREF]; // The prefix p
79 
80  uint8_t presBF; // Boolean indicating whether p is reported present by the Bloom Filter
81  uint8_t presFilter2; // Boolean indicating whether p_u is present in the Second Filter
82  uint8_t presFilter3; // Boolean indicating whether p_v is present in the Third Filter
83 
84 } resultPresence;
85 
91 typedef struct {
92  char** filenames;
94  uint64_t* hash_v;
95 
96  uint16_t** skip_sp;
97 
98  annotation_array_elem* comp_set_colors;
99 
100  info_per_level* info_per_lvl;
101 
102  annotation_inform* ann_inf;
103 
104  resultPresence* res;
105 
106  int k;
107  int r1;
108  int r2;
110  int length_comp_set_colors;
113  uint8_t compressed;
114  uint8_t marked;
115 
116  Node node;
117 } BFT_Root;
118 
// K-mer stored in a BFT_Root.
124 typedef struct{
125  char* kmer; // ASCII null-terminated k-mer
126  uint8_t* kmer_comp; // 2 bits encoded form of BFT_kmer::kmer
127  resultPresence* res; // Contains information about the location of BFT_kmer::kmer in a BFT_Root
128 } BFT_kmer;
129 
130 /* ===================================================================================================================================
131 * INLINE FUNCTIONS DECLARATION
132 * ===================================================================================================================================
133 */
134 
// Prototypes of the inline functions defined further down in this header.
135 inline bool are_genomes_ids_overlapping(BFT_Root* root_1, BFT_Root* root_2); // Compares the last file name of root_1 with the first file name of root_2
136 inline uint64_t* create_hash_v_array(int rand_seed1, int rand_seed2); // Allocates and fills an array of XXH64 hashes, two per substring value
137 inline Node* createNode(void); // Allocates and initializes a Node
138 inline void initiateNode(Node* node); // Initializes an existing Node
139 inline resultPresence* create_resultPresence(); // Allocates and initializes a resultPresence
140 inline void initialize_resultPresence(resultPresence* res); // Resets an existing resultPresence
141 
142 inline bool are_genomes_ids_overlapping(BFT_Root* root_1, BFT_Root* root_2){
143 
144  ASSERT_NULL_PTR(root_1,"get_overlap_genomes_ids()\n")
145  ASSERT_NULL_PTR(root_2,"get_overlap_genomes_ids()\n")
146 
147  if (strcmp(root_1->filenames[root_1->nb_genomes-1], root_2->filenames[0]) == 0)
148  return true;
149 
150  return false;
151 }
152 
153 inline uint64_t* create_hash_v_array(int rand_seed1, int rand_seed2){
154 
155  int j, nb_bits;
156  uint32_t nb_hash_v = pow(4, NB_CHAR_SUF_PREF);
157 
158  uint64_t* hash_v = malloc(nb_hash_v * 2 * sizeof(uint64_t));
159  ASSERT_NULL_PTR(hash_v, "create_hash_v_array()")
160 
161  uint8_t gen_sub[SIZE_BYTES_SUF_PREF];
162 
163  for (uint32_t i = 0; i < nb_hash_v; i++){
164 
165  nb_bits = NB_CHAR_SUF_PREF * 2;
166 
167  for (j = 0; j < SIZE_BYTES_SUF_PREF; j++){
168 
169  nb_bits -= SIZE_BITS_UINT_8T;
170 
171  if (nb_bits >= 0) gen_sub[j] = (i >> nb_bits) & 0xff;
172  else gen_sub[j] = (i << (-nb_bits)) & 0xff;
173  }
174 
175  hash_v[i * 2] = XXH64(gen_sub, SIZE_BYTES_SUF_PREF, rand_seed1);
176  hash_v[i * 2 + 1] = XXH64(gen_sub, SIZE_BYTES_SUF_PREF, rand_seed2);
177  }
178 
179  return hash_v;
180 }
181 
182 /* ---------------------------------------------------------------------------------------------------------------
183 * createNode()
184 * ---------------------------------------------------------------------------------------------------------------
185 * Allocate and initialize a node
186 * ---------------------------------------------------------------------------------------------------------------
187 * ---------------------------------------------------------------------------------------------------------------
188 */
189 inline Node* createNode(void){
190  Node* node = malloc(sizeof(Node));
191  ASSERT_NULL_PTR(node,"createNode()")
192 
193  node->CC_array = NULL;
194  initializeUC(&(node->UC_array));
195 
196  return node;
197 }
198 
199 /* ---------------------------------------------------------------------------------------------------------------
200 * initiateNode(node)
201 * ---------------------------------------------------------------------------------------------------------------
202 * Initialize a node
203 * ---------------------------------------------------------------------------------------------------------------
204 * node: a pointer on a Node structure
205 * ---------------------------------------------------------------------------------------------------------------
206 */
207 inline void initiateNode(Node* node){
208 
209  ASSERT_NULL_PTR(node,"initiateNode()")
210 
211  node->CC_array = NULL;
212  initializeUC(&(node->UC_array));
213 
214  return;
215 }
216 
217 inline resultPresence* create_resultPresence(){
218 
219  resultPresence* res = calloc(1,sizeof(resultPresence));
220  ASSERT_NULL_PTR(res,"create_resultPresence()")
221 
222  res->node = NULL;
223  res->container = NULL;
224  res->link_child = NULL;
225  res->posFilter2 = INT_MAX;
226  res->posFilter3 = INT_MAX;
227  res->pos_extra_filter3 = INT_MAX;
228 
229  return res;
230 }
231 
232 inline void initialize_resultPresence(resultPresence* res){
233 
234  ASSERT_NULL_PTR(res,"initialize_resultPresence()")
235 
236  res->node = NULL;
237  res->container = NULL;
238  res->link_child = NULL;
239  res->pos_container = 0;
240  res->level_node = 0;
241  res->bucket = 0;
242  res->pos_sub_bucket = 0;
243  res->presBF = 0;
244  res->presFilter2 = 0;
245  res->presFilter3 = 0;
246  res->count_children = 0;
247  res->count_nodes = 0;
248  res->pos_children = 0;
249  res->children_type_leaf = 0;
250  res->container_is_UC = 0;
251  res->posFilter2 = INT_MAX;
252  res->posFilter3 = INT_MAX;
253  res->pos_extra_filter3 = INT_MAX;
254 
255  return;
256 }
char ** filenames
Inserted genome file names.
Definition: Node.h:92
int treshold_compression
Color compression is triggered every BFT_Root::treshold_compression genome inserted.
Definition: Node.h:111
int nb_genomes
Number of genomes inserted.
Definition: Node.h:109
resultPresence * res
Contains information about the location of BFT_kmer::kmer in a BFT_Root.
Definition: Node.h:127
int k
Size of k-mers.
Definition: Node.h:106
uint8_t * kmer_comp
2 bits encoded form of BFT_kmer::kmer.
Definition: Node.h:126
K-mer stored in a BFT_Root.
Definition: Node.h:124
char * kmer
ASCII null-terminated k-mer.
Definition: Node.h:125
Root vertex of a BFT.
Definition: Node.h:91