BFGraph
src
CompressedSequence.hpp
1
#ifndef BFG_COMPRESSED_SEQUENCE_HPP
2
#define BFG_COMPRESSED_SEQUENCE_HPP
3
4
#include <cstring>
5
#include <string>
6
#include <stdint.h>
7
8
#include "Kmer.hpp"
9
10
/* Short description:
 *  - Compress a DNA string by using 2 bits per base instead of 8
 *  - Easily get the DNA string back from the compressed format
 *  - Create a sequence from a kmer
 *  - Get kmers from a sequence
 *  - Get length of a sequence
 *  - Easily get length of matching substring from a given string
 * */
18
class
CompressedSequence
{
19
20
public
:
21
22
CompressedSequence
();
23
~
CompressedSequence
();
24
25
CompressedSequence
(
const
CompressedSequence
& o);
26
CompressedSequence
& operator=(
const
CompressedSequence
& o);
27
28
// Move constructors
29
CompressedSequence
(
CompressedSequence
&& o);
30
CompressedSequence
& operator=(
CompressedSequence
&& o);
31
32
explicit
CompressedSequence
(
const
char
*s);
33
explicit
CompressedSequence
(
const
string
& s);
34
explicit
CompressedSequence
(
const
Kmer
& km);
35
36
const
char
operator[](
size_t
index)
const
;
37
38
void
clear();
39
40
size_t
size()
const
;
41
42
void
toString(
char
*s,
const
size_t
offset,
const
size_t
length)
const
;
43
string
toString(
const
size_t
offset,
const
size_t
length)
const
;
44
45
46
inline
string
toString()
const
{
return
toString(0,size()); }
47
inline
void
toString(
char
*s)
const
{ toString(s,0,size()); }
48
49
Kmer
getKmer(
size_t
offset)
const
;
50
51
bool
compareKmer(
const
size_t
offset,
const
Kmer
& km)
const
;
52
53
// void setSequence(const CompressedSequence &o, size_t length, size_t offset = 0, bool reversed=false);
54
void
setSequence(
const
CompressedSequence
& o,
const
size_t
start,
const
size_t
length,
const
size_t
offset = 0,
const
bool
reversed =
false
);
55
void
setSequence(
const
char
*s,
const
size_t
length,
const
size_t
offset = 0,
const
bool
reversed =
false
);
56
void
setSequence(
const
string
& s,
const
size_t
length,
const
size_t
offset = 0,
const
bool
reversed=
false
);
57
void
setSequence(
const
Kmer
& km,
const
size_t
length,
const
size_t
offset = 0,
const
bool
reversed=
false
);
58
59
void
reserveLength(
const
size_t
new_length);
60
61
CompressedSequence
rev()
const
;
62
63
size_t
jump(
const
char
*s,
const
size_t
i,
int
pos,
const
bool
reversed)
const
;
64
size_t
bw_jump(
const
char
*s,
const
size_t
i,
int
pos,
const
bool
reversed)
const
;
65
66
int64_t findKmer(
const
Kmer
& km)
const
;
67
68
bool
isShort()
const
;
69
70
private
:
71
72
inline
size_t
round_to_bytes(
const
size_t
len)
const
{
return
(len+3)/4; }
73
void
_resize_and_copy(
const
size_t
new_cap,
const
size_t
copy_limit);
74
void
initShort();
75
void
setSize(
const
size_t
size);
76
77
size_t
capacity()
const
;
78
const
unsigned
char
*getPointer()
const
;
79
80
static
const
uint8_t shortMask = 1;
81
82
union
{
83
84
struct
{
85
uint32_t _length;
// size of sequence
86
uint32_t _capacity;
// capacity of array allocated in bytes
87
unsigned
char
*_data;
// 0-based 2bit compressed dna string
88
unsigned
char
padding[16];
89
} asPointer;
90
91
struct
{
92
uint8_t _size;
// 7 bits can index up to 128
93
unsigned
char
_arr[31];
// can store 124 nucleotides
94
} asBits;
95
};
96
};
97
98
#endif // BFG_COMPRESSED_SEQUENCE_HPP
Kmer
Definition:
Kmer.hpp:31
CompressedSequence
Definition:
CompressedSequence.hpp:18
Generated on Tue Nov 14 2017 13:55:11 for BFGraph by
1.8.13