Skip to content

Commit

Permalink
initial implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
QuantumFelidae committed Dec 5, 2024
1 parent 5f3cbc6 commit 62556b7
Show file tree
Hide file tree
Showing 7 changed files with 242 additions and 3 deletions.
13 changes: 13 additions & 0 deletions include/data_gen.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once

#include <vector>
#include <string>


namespace wfa {

std::vector<std::string> generate_sequences(int32_t length, int32_t count);

std::vector<std::pair<std::string, std::string>> modify_sequences(int32_t length, int32_t count, double error_rate);

}
37 changes: 37 additions & 0 deletions include/kokkos_simd.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#pragma once

#include "kokkos_simd.hpp"

#include <string>
#include <string_view>
#include <vector>
#include <array>
#include <unordered_map>

namespace wfa {
//using wavefront_t = std::vector<std::array<std::vector<int32_t>, 3>>;
int32_t constexpr match = 2;
int32_t constexpr ins = 0;
int32_t constexpr del = 1;

struct wavefront_t {
std::vector<std::array<std::vector<int32_t>, 3>> data;
std::unordered_map<int32_t, int32_t> score_to_index;
std::vector<std::array<int32_t, 2>> low_hi;

int32_t lookup(int32_t score, int32_t column, int32_t row);
int32_t wave_size(int32_t score, bool low);

};

//bounded score lookup
//int32_t bn_s(wavefront_t& wavefront, int32_t score, int32_t column, int32_t row);

bool extend(wavefront_t& wavefront, std::string_view a, std::string_view b, int32_t score);

void next(wavefront_t& wavefront, int32_t s, int32_t x, int32_t o, int32_t e);

int32_t wavefront_simd(std::string_view a, std::string_view b, int32_t x, int32_t o, int32_t e);


}
4 changes: 2 additions & 2 deletions include/naive.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ namespace wfa {
* @param e The cost for extending a gap in the alignment.
* @return int The alignment cost between the two sequences.
*/
int naive(std::string_view a, std::string_view b, int x, int o, int e);
int32_t naive(std::string_view a, std::string_view b, int32_t x, int32_t o, int32_t e);

/**
* @brief Computes the alignment cost between two sequences using a wavefront alignment approach.
Expand All @@ -33,6 +33,6 @@ namespace wfa {
* @param e The cost for extending a gap in the alignment.
* @return int The alignment cost between the two sequences.
*/
int wavefront(std::string_view a, std::string_view b, int x, int o, int e);
int32_t wavefront(std::string_view a, std::string_view b, int32_t x, int32_t o, int32_t e);
}

4 changes: 4 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
set(WFA_SRC_FILES ${WFA_SRC_FILES}
"src/naive.cpp"
"src/kokkos_simd.cpp"
"src/data_gen.cpp"
PARENT_SCOPE
)

set(WFA_HEADER_FILES ${WFA_HEADER_FILES}
"include/naive.hpp"
"include/kokkos_simd.hpp"
"include/data_gen.hpp"
PARENT_SCOPE
)
45 changes: 45 additions & 0 deletions src/data_gen.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#include "include/data_gen.hpp"
#include <random>
#include <array>

constexpr std::array<char, 4> nt = { 'A', 'T', 'G', 'C' };

std::vector<std::string> wfa::generate_sequences(int32_t length, int32_t count) {
std::vector<std::string> strings;

std::random_device r;
std::default_random_engine random(r());
std::uniform_int_distribution<int32_t> int_rand(0, 3);

for (int32_t i = 0; i < count; ++i) {
std::string str;
for (int32_t j = 0; j < length; ++j) {
str.push_back(nt[int_rand(random)]);
}
strings.emplace_back(std::move(str));
}

return strings;
}

std::vector<std::pair<std::string, std::string>> wfa::modify_sequences(int32_t length, int32_t count, double error_rate) {
auto strings = generate_sequences(length, count);

std::random_device r;
std::default_random_engine random(r());
std::uniform_int_distribution<int32_t> int_rand(0, 3);
std::uniform_real_distribution<double> real_rand(0, 1);

std::vector<std::pair<std::string, std::string>> sequences;
for (auto& str : strings) {
std::string modified = str;
for (char& c : modified) {
if (real_rand(random) < error_rate) {
c = nt[int_rand(random)];
}
}
sequences.emplace_back(std::pair{str, modified});
}

return sequences;
}
129 changes: 129 additions & 0 deletions src/kokkos_simd.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#include "include/kokkos_simd.hpp"

int32_t wfa::wavefront_t::lookup(int32_t score, int32_t column, int32_t row) {
if (score < 0) {
return 0;
}
if (row < 0) {
return 0;
}
auto iter = score_to_index.find(score);
if (iter == score_to_index.end()) {
return 0;
}
if (row > data[iter->second][column].size() - 1) {
return 0;
}
return (data[iter->second][column][row]);
}

int32_t wfa::wavefront_t::wave_size(int32_t score, bool low) {
auto iter = score_to_index.find(score);
if (iter == score_to_index.end()) {
return 0;
}
auto& pair = low_hi[iter->second];
if (low) {
return pair[0];
}
return pair[1];
}

//int32_t wfa::bn_s(wavefront_t& wavefront, int32_t score, int32_t column, int32_t row) {
// if (score < 0) {
// return 0;
// }
// return wavefront[score][column][row];
//}

bool wfa::extend(wavefront_t& wavefront, std::string_view a, std::string_view b, int32_t score) {
std::vector<int32_t>& matchfront_back = wavefront.data.back()[2];
int32_t k_low = wavefront.wave_size(score, true);
int32_t k_high = wavefront.wave_size(score, false);
for (int32_t k = k_low; k < k_high + 1; ++k) {
int32_t starting_index = matchfront_back[k - k_low];
int32_t v = starting_index - k;
int32_t h = starting_index;
bool mismatch = false;
while (not mismatch) {
const char v_c = a.at(v);
const char h_c = b.at(h);
if (v_c == h_c) {
if (v == static_cast<int32_t>(a.size()) - 1 and h == static_cast<int32_t>(b.size()) - 1) {
return true;
}
++starting_index;
++v;
++h;
}
else {
mismatch = true;
}
}
matchfront_back[k - k_low] = starting_index;
}
return false;
}

void wfa::next(wavefront_t& wavefront, int32_t s, int32_t x, int32_t o, int32_t e) {
int32_t m_high_sx = 0;
int32_t m_low_sx = 0;
int32_t m_high_soe = 0;
int32_t m_low_soe = 0;
int32_t i_high_se = 0;
int32_t i_low_se = 0;
int32_t d_high_se = 0;
int32_t d_low_se = 0;
if (s - x >= 0) {
m_high_sx = wavefront.wave_size(s - x, false);
m_low_sx = wavefront.wave_size(s - x, true);
}
if (s - o - e >= 0) {
m_high_soe = wavefront.wave_size(s - o - e, false);
m_low_soe = wavefront.wave_size(s - o - e, true);
}
if (s - e >= 0) {
i_high_se = wavefront.wave_size(s - e, false);
i_low_se = wavefront.wave_size(s - e, true);
d_high_se = wavefront.wave_size(s - e, false);
d_low_se = wavefront.wave_size(s - e, true);
}

int32_t high = std::max({m_high_sx, m_high_soe, i_high_se, d_high_se}) + 1;
int32_t low = std::min({m_low_sx, m_low_soe, i_low_se, d_low_se}) - 1;

auto& low_hi_cur = wavefront.low_hi.emplace_back();
low_hi_cur[0] = low;
low_hi_cur[1] = high;

auto& wave_cur = wavefront.data.emplace_back();
wavefront.score_to_index.emplace(s, static_cast<int32_t>(wavefront.data.size()) - 1);
for (auto& vec : wave_cur) {
vec.resize(high - low + 1);
}
for (int32_t k = low; k < high + 1; ++k) {
wave_cur[ins][k - low] = std::max({wavefront.lookup(s - o - e, match, k - 1 - low), wavefront.lookup(s - e, ins, k - 1 - low)}) + 1;
wave_cur[del][k - low] = std::max({ wavefront.lookup(s - o - e, match, k + 1 - low), wavefront.lookup(s - e, del, k + 1 - low) });
wave_cur[match][k - low] = std::max({ wave_cur[ins][k - low], wave_cur[del][k - low], wavefront.lookup(s - x, match, k - low) + 1});
}
}

int32_t wfa::wavefront_simd(std::string_view a, std::string_view b, int32_t x, int32_t o, int32_t e) {
wavefront_t wavefront;
wavefront.data.emplace_back(std::array<std::vector<int32_t>, 3>{std::vector<int32_t>{0}, std::vector<int32_t>{0}, std::vector<int32_t>{0}});

wavefront.low_hi.emplace_back(std::array{0,0});
wavefront.score_to_index.emplace(0, 0);
bool matched = false;

int32_t score = 0;
while (not matched) {
matched = extend(wavefront, a, b, score);
if (matched) {
break;
}
score = score + 1;
next(wavefront, score, x, o, e);
}
return score;
}
13 changes: 12 additions & 1 deletion src/wfa_tool.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
#include "include/data_gen.hpp"
#include "include/kokkos_simd.hpp"

int main() {
return 0;
/*auto sequences = wfa::modify_sequences(100, 1000, 0.25);
for (const auto& pair : sequences) {
int32_t dist = wfa::wavefront_simd(pair.first, pair.second, 4, 6, 2);
}*/

std::string a = "tert";
std::string b = "test";
wfa::wavefront_simd(a, b, 4, 6, 2);
}

0 comments on commit 62556b7

Please sign in to comment.