Skip to content

Commit

Permalink
fixup
Browse files Browse the repository at this point in the history
  • Loading branch information
sakria9 committed Mar 18, 2024
1 parent 21f7e33 commit aac09b6
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 26 deletions.
9 changes: 8 additions & 1 deletion artifact/get_func_name.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,16 @@ def get_func_name(func, file):
and not x.__contains__("(table (;")
and not x.__contains__("global.get")
]

import_count = len([
x
for x in output
if x.__contains__("(import ")
])

for i in range(len(output1)):
if i == func:
return output1[i].split(" ")[3]
return output1[i].split(" ")[3] + "-funcid-" + str(i - import_count)

if __name__ == "__main__":
import sys
Expand Down
1 change: 1 addition & 0 deletions src/checkpoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ int main(int argc, char *argv[]) {
wamr->instantiate();
wamr->get_int3_addr();
wamr->replace_int3_with_nop();
wamr->replace_mfence_with_nop();

// get current time
auto start = std::chrono::high_resolution_clock::now();
Expand Down
67 changes: 52 additions & 15 deletions src/profile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,15 @@ std::vector<std::unique_ptr<WAMRExecEnv>> as;
std::mutex as_mtx;
long snapshot_memory = 0;

std::vector<std::vector<size_t>> stack_record;
std::vector<std::vector<std::pair<size_t, size_t>>> stack_record;
void unwind(WASMExecEnv *instance) {
auto cur_frame = (AOTFrame *)instance->cur_frame;
std::vector<size_t> stack;
auto ip = cur_frame->ip_offset;
std::vector<std::pair<size_t, size_t>> stack;
while (cur_frame != nullptr) {
auto func_index = cur_frame->func_index;
stack.emplace_back(func_index);
auto ip = cur_frame->ip_offset;
stack.emplace_back(func_index, ip);
cur_frame = cur_frame->prev_frame;
}
stack_record.emplace_back(stack);
Expand Down Expand Up @@ -159,8 +161,8 @@ int main(int argc, char *argv[]) {
std::atomic<bool> enable_send_sigint{false};
auto send_sigint_thread = std::thread([&]() {
while (true) {
// sample every 50ms
std::this_thread::sleep_for(std::chrono::milliseconds(50));
// sample every 10ms
std::this_thread::sleep_for(std::chrono::milliseconds(10));
if (enable_send_sigint.load())
kill(getpid(), SIGINT);
}
Expand Down Expand Up @@ -188,14 +190,17 @@ int main(int argc, char *argv[]) {
SPDLOG_INFO("Execution time: {} s", dur.count() / 1000000.0);

std::map<size_t, size_t> func_count, last_func_count;
std::map<size_t, std::map<size_t, size_t>> ip_count_per_func;
std::map<size_t, std::string> func_name;
std::map<size_t, size_t> aot_idx;
for (const auto &stack : stack_record) {
if (stack.empty())
continue;
last_func_count[stack[0]]++;
for (auto f : stack) {
last_func_count[stack[0].first]++;
for (auto [f, ip] : stack) {
func_count[f]++;
func_name[f] = "";
ip_count_per_func[f][ip]++;
}
}

Expand Down Expand Up @@ -224,24 +229,56 @@ int main(int argc, char *argv[]) {
for (const auto &e : func_idx) {
std::string name;
ss >> name;
size_t idx;
ss >> idx;
func_name[e] = name;
aot_idx[e] = idx;
}
pclose(pipe);

// print the result
std::cout << "Last level function called count\n"
<< "--------------------------------\n"
<< std::endl;
for (const auto &e : last_func_count) {
std::cout << std::format("{} {}\n", func_name[e.first], e.second);
<< "--------------------------------\n";
std::vector<size_t> last_func_idx;
std::transform(last_func_count.begin(), last_func_count.end(), std::back_inserter(last_func_idx),
[](const std::pair<size_t, size_t> &p) { return p.first; });
std::sort(last_func_idx.begin(), last_func_idx.end(),
[&last_func_count](size_t a, size_t b) { return last_func_count[a] > last_func_count[b]; });
for (const auto &e : last_func_idx) {
std::cout << std::format("{} {}\n", func_name[e], last_func_count[e]);
std::cout << "IP count:\n";
for (auto [ip, cnt] : ip_count_per_func[e]) {
std::cout << std::format("func {} ip {} count {}\n", e, ip, cnt);
}
}
std::cout << std::endl;
std::cout << "Total function called count\n"
<< "--------------------------\n"
<< std::endl;
for (const auto &e : func_count) {
std::cout << std::format("{} {}\n", func_name[e.first], e.second);
<< "--------------------------\n";
std::sort(func_idx.begin(), func_idx.end(),
[&func_count](size_t a, size_t b) { return func_count[a] > func_count[b]; });
for (const auto &e : func_idx) {
std::cout << std::format("{} {}\n", func_name[e], func_count[e]);
}

std::ofstream out(target + ".pgo");
size_t total_sample_count = stack_record.size();
std::vector<std::pair<size_t, size_t>> pgo_list;
for (const auto &e : last_func_idx) {
for (auto [ip, cnt] : ip_count_per_func[e]) {
auto freq = (double)cnt / (double)total_sample_count;
if (freq > 0.15) {
std::cout << std::format("pgo name {} idx {} ip {} freq {}\n", func_name[e], aot_idx[e], ip, freq);
pgo_list.emplace_back(aot_idx[e], ip);
} else {
std::cout << std::format("no pgo name {} idx {} ip {} freq {}\n", func_name[e], aot_idx[e], ip, freq);
}
}
}
out << pgo_list.size() << std::endl;
for (const auto &e : pgo_list) {
out << e.first << " " << e.second << std::endl;
}
std::cout << "PGO file has been written to " << target + ".pgo" << std::endl;

return 0;
}
29 changes: 20 additions & 9 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ set(HOST_PREFIX ${WASI_SDK_DIR}/bin/)
enable_testing()
function(wamr_app input)
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
COMMAND ${HOST_PREFIX}clang -fopenmp=libomp -lomp -Wno-implicit-function-declaration -O3 -Wno-int-conversion --target=wasm32-wasi-threads -lwasi-emulated-getpid -lwasi-emulated-process-clocks -D_WASI_EMULATED_SIGNAL -D_WASI_EMULATED_PROCESS_CLOCKS -g -pthread -Wl,--max-memory=3355443200 -z stack-size=65536 -Wl,--export-all -Wl,--allow-undefined -Wl,--allow-undefined-file=${WASI_SDK_DIR}/share/wasi-sysroot/share/wasm32-wasi-threads/undefined-symbols.txt --sysroot=${HOST_PREFIX}/../share/wasi-sysroot ${WAMR_DIR}/samples/socket-api/wasm-app-prefix/src/wasm-app-build/libsocket_wasi_ext.a -I${WAMR_DIR}/core/iwasm/libraries/lib-socket/inc ${CMAKE_CURRENT_SOURCE_DIR}/${input}.c -o ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
# single thread
COMMAND ${HOST_PREFIX}clang -Wno-implicit-function-declaration -Wno-int-conversion --target=wasm32-wasi-threads -lwasi-emulated-getpid -lwasi-emulated-process-clocks -lwasi-emulated-mman -D_WASI_EMULATED_MMAN -D_WASI_EMULATED_SIGNAL -D_WASI_EMULATED_PROCESS_CLOCKS -g -fopenmp=libomp -lomp -pthread -Wl,--max-memory=3355443200 -z stack-size=65536 -Wl,--allow-undefined -Wl,--export-all -Wl,--allow-undefined-file=${WASI_SDK_DIR}/share/wasi-sysroot/share/wasm32-wasi-threads/undefined-symbols.txt --sysroot=${HOST_PREFIX}/../share/wasi-sysroot ${WAMR_DIR}/samples/socket-api/wasm-app-prefix/src/wasm-app-build/libsocket_wasi_ext.a -I${WAMR_DIR}/core/iwasm/libraries/lib-socket/inc ${CMAKE_CURRENT_SOURCE_DIR}/${input}.c -o ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
# openmp multi-thread
# COMMAND ${HOST_PREFIX}clang -fopenmp=libomp -lomp -Wno-implicit-function-declaration -O3 -Wno-int-conversion --target=wasm32-wasi-threads -lwasi-emulated-getpid -lwasi-emulated-process-clocks -D_WASI_EMULATED_SIGNAL -D_WASI_EMULATED_PROCESS_CLOCKS -g -pthread -Wl,--max-memory=3355443200 -z stack-size=65536 -Wl,--export-all -Wl,--allow-undefined -Wl,--allow-undefined-file=${WASI_SDK_DIR}/share/wasi-sysroot/share/wasm32-wasi-threads/undefined-symbols.txt --sysroot=${HOST_PREFIX}/../share/wasi-sysroot ${WAMR_DIR}/samples/socket-api/wasm-app-prefix/src/wasm-app-build/libsocket_wasi_ext.a -I${WAMR_DIR}/core/iwasm/libraries/lib-socket/inc ${CMAKE_CURRENT_SOURCE_DIR}/${input}.c -o ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${input}.c
)
add_custom_target(${input}_wamr ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm)
Expand Down Expand Up @@ -38,17 +41,23 @@ function(wamr_app input)
)
add_custom_target(${input}_ckpt_loop_compile ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-loop.aot)

add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-loop-dirty.aot
COMMAND ${CMAKE_SOURCE_DIR}/lib/wasm-micro-runtime/wamr-compiler/build/wamrc --disable-aux-stack-check --enable-loop-checkpoint --enable-aux-stack-dirty-bit -o ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-loop-dirty.aot ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-loop-counter.aot
COMMAND ${CMAKE_SOURCE_DIR}/lib/wasm-micro-runtime/wamr-compiler/build/wamrc --disable-aux-stack-check --enable-counter-loop-checkpoint -o ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-loop-counter.aot ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
)
add_custom_target(${input}_ckpt_loop_dirty_compile ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-loop-dirty.aot)
add_custom_target(${input}_ckpt_loop_counter_compile ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-loop-counter.aot)

add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-br.aot
COMMAND ${CMAKE_SOURCE_DIR}/lib/wasm-micro-runtime/wamr-compiler/build/wamrc --disable-aux-stack-check --enable-br-checkpoint -o ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-br.aot ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
)
add_custom_target(${input}_ckpt_br_compile ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-br.aot)
# add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-loop-dirty.aot
# COMMAND ${CMAKE_SOURCE_DIR}/lib/wasm-micro-runtime/wamr-compiler/build/wamrc --disable-aux-stack-check --enable-loop-checkpoint --enable-aux-stack-dirty-bit -o ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-loop-dirty.aot ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
# DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
# )
# add_custom_target(${input}_ckpt_loop_dirty_compile ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-loop-dirty.aot)

# add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-br.aot
# COMMAND ${CMAKE_SOURCE_DIR}/lib/wasm-micro-runtime/wamr-compiler/build/wamrc --disable-aux-stack-check --enable-br-checkpoint -o ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-br.aot ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
# DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${input}.wasm
# )
# add_custom_target(${input}_ckpt_br_compile ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${input}-ckpt-br.aot)

add_test(NAME ${input}_checkpoint COMMAND ${CMAKE_CURRENT_BINARY_DIR}/../MVVM_checkpoint --target ${CMAKE_CURRENT_BINARY_DIR}/${input}.aot)
add_test(NAME ${input}_restore COMMAND ${CMAKE_CURRENT_BINARY_DIR}/../MVVM_restore --target ${CMAKE_CURRENT_BINARY_DIR}/${input}.aot)
Expand All @@ -59,6 +68,8 @@ function(wamr_app input)
endfunction()

wamr_app(counter)
wamr_app(vadd)
wamr_app(sgemm)
wamr_app(read-file)
wamr_app(gups)
wamr_app(gemm)
Expand Down
49 changes: 49 additions & 0 deletions test/sgemm.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include <stdio.h>
#include <stdlib.h>

/*
 * Accumulating single-precision matrix multiply: C += A * B.
 *
 * A is indexed as A[i * k + p], B as B[p * n + j] and C as C[i * n + j],
 * i.e. all three are flat row-major buffers of m*k, k*n and m*n floats.
 * Each output cell starts from its existing value in C, so the caller must
 * have C initialised before the call.
 */
void sgemm(int m, int n, int k, const float *__restrict__ A, const float *__restrict__ B, float *__restrict__ C) {
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            float *out = &C[row * n + col];
            // Seed the running sum with the current cell so repeated calls accumulate.
            float acc = *out;
            for (int inner = 0; inner < k; inner++) {
                acc += A[row * k + inner] * B[inner * n + col];
            }
            *out = acc;
        }
    }
}

/*
 * Fill a row*column float buffer with values drawn from rand().
 *
 * Elements are written at matrix[j * row + i] for j in [0, column) and
 * i in [0, row), which visits the buffer in increasing index order.
 * Nothing is written (and rand() is not called) when row or column is <= 0.
 */
void init(float *matrix, int row, int column) {
    if (row <= 0) {
        return;  // the inner loop would never execute
    }
    for (int col = 0; col < column; col++) {
        const int base = col * row;
        for (int r = 0; r < row; r++) {
            const double sample = (double)(rand());
            matrix[base + r] = sample;
        }
    }
}

/*
 * SGEMM workload: multiplies two 512x512 random matrices ten times,
 * accumulating into C, then prints C[0].
 *
 * Returns 0 on success, 1 if any buffer could not be allocated.
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    const int rowsA = 512;
    const int colsB = 512;
    const int common = 512;

    float *A = (float *)malloc((size_t)rowsA * common * sizeof(float));
    float *B = (float *)malloc((size_t)common * colsB * sizeof(float));
    // sgemm() reads C as the starting value of every accumulator, so it must
    // begin zeroed; malloc() would leave its contents indeterminate.
    float *C = (float *)calloc((size_t)rowsA * colsB, sizeof(float));

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "sgemm: out of memory\n");
        free(A);  // free(NULL) is a no-op, so partial failure is handled too
        free(B);
        free(C);
        return 1;
    }

    init(A, rowsA, common);
    init(B, common, colsB);
    for (int i = 0; i < 10; i++) {
        sgemm(rowsA, colsB, common, A, B, C);
    }

    printf("%f\n", C[0]);
    free(A);
    free(B);
    free(C);

    return 0;
}
28 changes: 28 additions & 0 deletions test/vadd.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#include <stdlib.h>
#include <stdio.h>

/* Number of elements in every vector handled by this program. */
#define N 1000000

/*
 * Element-wise sum of two N-element float vectors: c[i] = a[i] + b[i].
 * The __restrict__ qualifiers promise the compiler that a, b and c do not
 * overlap; callers must pass three distinct buffers of at least N floats.
 */
void vector_add(const float* __restrict__ a, const float * __restrict__ b, float* __restrict__ c) {
    int idx = 0;
    while (idx < N) {
        c[idx] = a[idx] + b[idx];
        ++idx;
    }
}

/*
 * Vector-add workload: sums two N-element vectors ten times and prints
 * c[0] to stderr.
 *
 * Returns 0 on success, 1 if any buffer could not be allocated.
 */
int main() {
    float *a = (float*)malloc(N * sizeof(float));
    float *b = (float*)malloc(N * sizeof(float));
    float *c = (float*)malloc(N * sizeof(float));

    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "vadd: out of memory\n");
        free(a);  // free(NULL) is a no-op, so partial failure is handled too
        free(b);
        free(c);
        return 1;
    }

    // Give the inputs defined contents: malloc() leaves them indeterminate,
    // and vector_add() reads every element of a and b.
    for (int i = 0; i < N; i++) {
        a[i] = (float)i;
        b[i] = 1.0f;
    }

    for (int i = 0; i < 10; i++) {
        vector_add(a, b, c);
    }

    fprintf(stderr, "c[0] = %f\n", c[0]);

    free(a);
    free(b);
    free(c);
    return 0;
}

0 comments on commit aac09b6

Please sign in to comment.