Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add latency estimations to simulator #21

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 31 additions & 15 deletions src/cgra.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
import copy
import numpy as np
from ctypes import c_int32
import csv
import os.path

from characterization import display_characterization, get_latency_cc
from kernels import *
from memory import *

# CGRA from left to right, top to bottom
N_ROWS = 4
N_COLS = 4
INSTR_SIZE = N_ROWS+1
MAX_COL = N_COLS - 1
MAX_ROW = N_ROWS - 1

BUS_TYPES = ["ONE-TO-M", "N-TO-M", "INTERLEAVED"]
PRINT_OUTS = 1

MAX_32b = 0xFFFFFFFF
Expand Down Expand Up @@ -47,8 +49,8 @@ def print_out( prs, outs, insts, ops, reg ):
elif pr == "R1" : pnt = reg[1]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why isn't latency or energy one of these options?

elif pr == "R2" : pnt = reg[2]
elif pr == "R3" : pnt = reg[3]

out_string += "["
if pnt != []:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And what if pnt == []? Should the for loop still be executed? What was the idea behind it?

out_string += "["
for i in range(len(pnt)):
out_string += "{{{}:4}}".format(i)
if i == (len(pnt) - 1):
Expand All @@ -60,7 +62,7 @@ def print_out( prs, outs, insts, ops, reg ):


class CGRA:
def __init__( self, kernel, memory, read_addrs, write_addrs):
def __init__( self, kernel, memory, read_addrs, write_addrs, memory_manager):
self.cells = []
for r in range(N_ROWS):
list = []
Expand All @@ -71,6 +73,11 @@ def __init__( self, kernel, memory, read_addrs, write_addrs):
self.memory = memory
self.instr2exec = 0
self.cycles = 0
self.N_COLS = N_COLS
self.N_ROWS = N_ROWS
self.total_latency_cc = 0
self.instr_latency_cc = []
self.max_latency_instr = None
JuanSapriza marked this conversation as resolved.
Show resolved Hide resolved
if read_addrs is not None and len(read_addrs) == N_COLS:
self.load_addr = read_addrs
else:
Expand All @@ -79,6 +86,8 @@ def __init__( self, kernel, memory, read_addrs, write_addrs):
self.store_addr = write_addrs
else:
self.store_addr = [0]*N_COLS
self.init_store = copy.copy(self.store_addr)
self.memory_manager = memory_manager
self.exit = False

def run( self, pr, limit ):
Expand Down Expand Up @@ -109,10 +118,12 @@ def step( self, prs="ROUT" ):
ops = [ self.cells[r][i].op for i in range(N_COLS) ]
reg = [[ self.cells[r][i].regs[regs[x]] for i in range(N_COLS) ] for x in range(len(regs)) ]
print_out( prs, outs, insts, ops, reg )

self.flag_poll_cnt = 0
get_latency_cc(self)
self.instr2exec += 1
self.cycles += 1
return self.exit
return self.exit


def get_neighbour_address( self, r, c, dir ):
n_r = r
Expand Down Expand Up @@ -181,6 +192,8 @@ def __init__( self, parent, row, col ):
self.regs = {'R0':0, 'R1':0, 'R2':0, 'R3':0 }
self.op = ""
self.instr = ""
self.latency_cc = 0
self.addr = 0

def get_out( self ):
return self.old_out
Expand Down Expand Up @@ -222,7 +235,6 @@ def run_instr( self, instr):
self.op = instr[0]
except:
self.op = instr

if self.op in self.ops_arith:
des = instr[1]
val1 = self.fetch_val( instr[2] )
Expand Down Expand Up @@ -251,25 +263,27 @@ def run_instr( self, instr):

elif self.op in self.ops_lwd:
des = instr[1]
self.addr = self.parent.load_addr[self.col]
ret = self.parent.load_direct( self.col, 4 )
if des in self.regs: self.regs[des] = ret
self.out = ret

elif self.op in self.ops_swd:
val = self.fetch_val( instr[1] )
self.addr = self.parent.store_addr[self.col]
self.parent.store_direct( self.col, val, 4 )

elif self.op in self.ops_lwi:
des = instr[1]
addr = self.fetch_val( instr[2] )
ret = self.parent.load_indirect(addr)
self.addr = self.fetch_val( instr[2] )
ret = self.parent.load_indirect(self.addr)
if des in self.regs: self.regs[des] = ret
self.out = ret

elif self.op in self.ops_swi:
addr = self.fetch_val( instr[2] )
self.addr = self.fetch_val( instr[2] )
val = self.fetch_val( instr[1] )
self.parent.store_indirect( addr, val )
self.parent.store_indirect( self.addr, val )
pass

elif self.op in self.ops_nop:
Expand Down Expand Up @@ -375,7 +389,9 @@ def blt( self, val1, val2, branch ):
ops_jump = { 'JUMP' : '' }
ops_exit = { 'EXIT' : '' }

def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs=None):


def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs=None, memory_manager=MEMORY()):
ker = []
mem = []

Expand All @@ -400,12 +416,12 @@ def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs=
return None

# Run the kernel
cgra = CGRA(ker, mem, load_addrs, store_addrs)
cgra = CGRA(ker, mem, load_addrs, store_addrs, memory_manager)
mem = cgra.run(pr, limit)

# Store the output sorted
sorted_mem = sorted(mem, key=lambda x: x[0])
with open( kernel + "/"+FILENAME_MEM_O+version+EXT, 'w+') as f:
for row in sorted_mem: csv.writer(f).writerow(row)

display_characterization(cgra, pr)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if a user does not want to print the characterization?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pr parameter takes in the different types of information the user wants to display. If a user doesn't want to print anything, he can do so by leaving the pr string array empty from the notebook.

print("\n\nEND")
179 changes: 179 additions & 0 deletions src/characterization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import copy
import math
import os.path
import csv

OPERATIONS_MEMORY_ACCESS = ["LWD", "LWI", "SWD","SWI"]
BUS_TYPES = ["ONE-TO-M", "N-TO-M", "INTERLEAVED"]
INTERVAL_CST = 14

def load_operation_characterization(characterization_type, mapping_file='operation_characterization.csv'):
    """Load one section of the operation characterization CSV.

    The file is resolved relative to the directory of this module (an
    absolute ``mapping_file`` is used as-is, which is how ``os.path.join``
    treats an absolute second component).

    A row whose first field starts with ``#`` opens a new section, named by
    that field with leading/trailing ``#`` and spaces stripped.  Only rows in
    the section ``operation_<characterization_type>_mapping`` are collected;
    empty rows are ignored and do not end the current section.

    Args:
        characterization_type: Section selector, e.g. ``"latency_cc"``.
        mapping_file: CSV file name or path.

    Returns:
        dict: ``{operation: int}`` for two-field rows and
        ``{operation: {int: float}}`` for three-field rows.  Rows of any
        other width are skipped.
    """
    wanted_section = f'operation_{characterization_type}_mapping'
    csv_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), mapping_file)
    operation_mapping = {}
    current_section = None
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(csv_file_path, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                continue
            if row[0].startswith('#'):
                current_section = row[0].strip('# ')
                continue
            if current_section != wanted_section:
                continue
            operation, *rest = row
            if len(rest) == 1:
                operation_mapping[operation] = int(rest[0])
            elif len(rest) == 2:
                operation_mapping.setdefault(operation, {})[int(rest[0])] = float(rest[1])
    return operation_mapping

operation_latency_mapping = load_operation_characterization("latency_cc")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When is this code executed? When the module is imported? Shouldn't this be in an init_characterization or load_characterization function or something similar? Then we should call these functions from the notebook. I haven't finished reading the code, but I find it weird that all this works without requiring any changes to the notebooks that launch the examples, right?

bus_type_active_row_coef = load_operation_characterization("active_row_coef")
bus_type_cpu_loop_instrs = load_operation_characterization("cpu_loop_instrs")

# This function takes the maximum latency between the memory operations and the non-memory operations in the instruction
def get_latency_cc(cgra):
cgra.max_latency_instr = None
longest_alu_op_latency_cc = get_latency_alu_cc(cgra)
total_mem_latency_cc = get_latency_mem_cc(cgra)
cgra.max_latency_instr.latency_cc = max(longest_alu_op_latency_cc, total_mem_latency_cc)
if total_mem_latency_cc > longest_alu_op_latency_cc:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and if not?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This if condition only serves to add "MEM" to the longest operation's name when the mem ops take longer than the non-mem ones. If this is not the case, then the name is just that of the longest non-mem operation.

cgra.max_latency_instr.instr = f'MEM ({cgra.max_latency_instr.instr})'
if (cgra.exit):
cgra.max_latency_instr.latency_cc += 1
cgra.max_latency_instr.instr2exec = cgra.instr2exec
cgra.instr_latency_cc.append(copy.copy(cgra.max_latency_instr))
cgra.total_latency_cc += cgra.instr_latency_cc[-1].latency_cc

def get_latency_alu_cc(cgra):
    """Assign a latency to every cell and track the slowest one.

    Each cell's ``op`` is looked up in ``operation_latency_mapping`` and the
    result, converted to ``int``, is stored in the cell's ``latency_cc``.
    ``cgra.max_latency_instr`` is replaced whenever it is ``None`` or a cell
    with a strictly larger latency is found, so ties keep the earlier cell
    in row-major order.

    Returns:
        The ``latency_cc`` of ``cgra.max_latency_instr`` after the scan.
    """
    for row in range(cgra.N_ROWS):
        for col in range(cgra.N_COLS):
            cell = cgra.cells[row][col]
            cell.latency_cc = int(operation_latency_mapping[cell.op])
            slowest = cgra.max_latency_instr
            if slowest is None or cell.latency_cc > slowest.latency_cc:
                cgra.max_latency_instr = cell
    return cgra.max_latency_instr.latency_cc

def get_latency_mem_cc(cgra):
    """Run the memory-latency pipeline for the current instruction.

    Steps, in order: record the bank index of every memory access, group the
    accesses (stored on ``cgra.concurrent_accesses``), derive the per-column
    dependencies, and turn them into a latency.

    Returns:
        The value produced by ``compute_latency_cc``.
    """
    record_bank_access(cgra)
    cgra.concurrent_accesses = group_dma_accesses(cgra)
    return compute_latency_cc(cgra, track_dependencies(cgra))

# Record the bank index used for each memory access
def record_bank_access(cgra):
    """Store ``bank_index`` on every cell whose op is a memory access.

    Cells whose ``op`` is not listed in ``OPERATIONS_MEMORY_ACCESS`` are left
    untouched.
    """
    for row in range(cgra.N_ROWS):
        for col in range(cgra.N_COLS):
            cell = cgra.cells[row][col]
            if cell.op not in OPERATIONS_MEMORY_ACCESS:
                continue
            cell.bank_index = compute_bank_index(cgra, row, col)

def compute_bank_index(cgra, r, c):
base_addr = cgra.init_store[0] if cgra.cells[r][c].op == "SWD" else sorted(cgra.memory)[0][0]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't fully understand what this base address stands for, or why it's different for SWD...

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

base_addr corresponds to the address of the first memory element (cgra_input[0][0]). We need this value to find the position of the elements and ultimately return the corresponding bank index. (ie: 9th element accessed -> bank 0).
We need to handle SWD separately due to the starting address.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but why is the starting address related to the bank number?

if cgra.memory_manager.bus_type == "INTERLEAVED":
index_pos = int(((cgra.cells[r][c].addr - base_addr) / cgra.memory_manager.spacing) % cgra.memory_manager.n_banks)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the spacing? If it's the number of bytes for each address, you may want to call it alignment_B or word_size_B (my preferred option).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add units to variables so that we know whether they are in bits (b), bytes (B), or words of 4 bytes (w). My standard is to append the unit after an underscore: word_size_B, banks_n (with n standing for something that is just a count and therefore has no units).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

✔️

else:
index_pos = cgra.cells[r][c].addr / cgra.memory_manager.bank_size
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't ONE-TO-M be another case where the index is always the same (to simulate that they always get a conflict)?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

✔️

return index_pos

def group_dma_accesses(cgra):
    """Group the memory accesses of the current instruction into slots.

    Returns a list with one dict per CGRA row (slot).  Each dict maps a bank
    index to the list of ``(row, col)`` PEs placed in that slot for that bank.
    ``cgra.covered_accesses`` is reset and filled with every PE that was
    placed.

    For each slot ``r`` and column ``c``: if the PE at ``(r, c)`` performs a
    memory access that is not yet covered, it is placed in slot ``r``.
    Otherwise the first uncovered memory access found in column ``c``
    (scanning rows top to bottom) is pushed up into slot ``r``.

    If ``accesses_are_ordered`` rejects the grouping, the result of
    ``rearrange_accesses`` is returned instead.
    """
    cgra.covered_accesses = []
    # One slot per row; sized from the CGRA rather than a fixed 4 so that the
    # ``concurrent_accesses[r]`` lookups below stay in range for any N_ROWS.
    concurrent_accesses = [{} for _ in range(cgra.N_ROWS)]
    for r in range(cgra.N_ROWS):
        for c in range(cgra.N_COLS):
            if cgra.cells[r][c].op in OPERATIONS_MEMORY_ACCESS and (r, c) not in cgra.covered_accesses:
                cgra.covered_accesses, concurrent_accesses = mark_access(
                    cgra.covered_accesses, concurrent_accesses, r, c, r, cgra.cells[r][c].bank_index)
            else:
                # Nothing new at (r, c): push up the first pending access of this column.
                for k in range(cgra.N_ROWS):
                    if cgra.cells[k][c].op in OPERATIONS_MEMORY_ACCESS and (k, c) not in cgra.covered_accesses:
                        cgra.covered_accesses, concurrent_accesses = mark_access(
                            cgra.covered_accesses, concurrent_accesses, r, c, k, cgra.cells[k][c].bank_index)
                        break
    if not accesses_are_ordered(cgra, concurrent_accesses):
        concurrent_accesses = rearrange_accesses(cgra, concurrent_accesses)
    return concurrent_accesses

def mark_access(covered_accesses, concurrent_accesses, r, c, k, bank_index):
    """Register the PE ``(k, c)`` as covered and place it in slot ``r``.

    Both containers are modified in place and also returned.

    Args:
        covered_accesses: List of ``(row, col)`` PEs already placed.
        concurrent_accesses: List of dicts mapping bank index to PE list.
        r: Index of the slot receiving the access.
        c: Column of the PE.
        k: Row of the PE.
        bank_index: Key under which the PE is filed inside slot ``r``.

    Returns:
        The tuple ``(covered_accesses, concurrent_accesses)``.
    """
    pe = (k, c)
    covered_accesses.append(pe)
    slot = concurrent_accesses[r]
    if bank_index not in slot:
        slot[bank_index] = []
    slot[bank_index].append(pe)
    return covered_accesses, concurrent_accesses

def accesses_are_ordered(cgra, concurrent_accesses):
    """Tell whether the grouped accesses can be used without rearranging.

    For any bus type other than ``"INTERLEAVED"`` this always returns
    ``False``.  For the interleaved bus, every slot is scanned in dict order
    and the PE row indices (first element of each access) must never
    decrease within a slot.

    Args:
        cgra: Object exposing ``N_ROWS`` and ``memory_manager.bus_type``.
        concurrent_accesses: List of at least ``N_ROWS`` dicts mapping a bank
            index to a list of ``(row, col)`` accesses.

    Returns:
        bool: ``True`` only for an interleaved bus with ordered slots.
    """
    if cgra.memory_manager.bus_type != "INTERLEAVED":
        return False
    for i in range(cgra.N_ROWS):
        # Tracked per slot; a fixed-size list of 4 entries would overflow
        # for CGRAs with more than four rows.
        highest_row = 0
        for accesses in concurrent_accesses[i].values():
            for current_access in accesses:
                if highest_row > current_access[0]:
                    return False
                highest_row = current_access[0]
    return True

# This function arranges the concurrent lists to ensure they match the DMA's behavior
def rearrange_accesses(cgra, concurrent_accesses):
    """Reorder (or collapse) the grouped accesses.

    Interleaved bus: walking from the last slot up to slot 1, the accesses of
    the slot above (``i - 1``) are re-sorted, bank by bank, by the position
    of their column in the reversed column sequence of slot ``i``.  Columns
    that do not appear in slot ``i`` go last; the sort is stable.  The input
    list is updated in place and returned.

    Any other bus type: a new list is returned whose first slot holds, under
    key ``1``, one ``(0, 0)`` entry per element of ``cgra.covered_accesses``
    and whose remaining ``N_ROWS - 1`` slots are empty.

    Args:
        cgra: Object exposing ``N_ROWS``, ``memory_manager.bus_type`` and,
            for non-interleaved buses, ``covered_accesses``.
        concurrent_accesses: List of dicts mapping bank index to a list of
            ``(row, col)`` accesses.

    Returns:
        list: The rearranged list of slot dicts.
    """
    if cgra.memory_manager.bus_type != "INTERLEAVED":
        # Latencies for non-interleaved bus types require the total number of accesses
        total_accesses = len(cgra.covered_accesses)
        return [{1: [(0, 0)] * total_accesses}] + [{} for _ in range(cgra.N_ROWS - 1)]

    for i in range(cgra.N_ROWS - 1, 0, -1):
        columns_below = [pe[1] for group in concurrent_accesses[i].values() for pe in group][::-1]
        # First position of each column, so the sort key is a dict lookup
        # instead of a list scan per element.
        rank = {}
        for position, column in enumerate(columns_below):
            rank.setdefault(column, position)
        concurrent_accesses[i - 1] = {
            bank: sorted(group, key=lambda pe: rank.get(pe[1], float('inf')))
            for bank, group in concurrent_accesses[i - 1].items()
        }
    return concurrent_accesses

def track_dependencies(cgra):
    """Accumulate a per-column latency figure from the grouped accesses.

    Starts every column at 1.  For each access in each slot of
    ``cgra.concurrent_accesses``, adds the access's 1-based position within
    its slot (as reported by ``find_position`` for its column) minus the
    position reported for the same column in the next slot; the last slot
    uses 0 for the next position.

    Returns:
        list: One entry per CGRA column.
    """
    # Indexed by column below, so sized from N_COLS rather than a fixed 4.
    latency = [1] * cgra.N_COLS
    last_slot = cgra.N_ROWS - 1
    for i in range(cgra.N_ROWS):
        for accesses in cgra.concurrent_accesses[i].values():
            for current_access in accesses:
                column = current_access[1]
                # Compare each access with its next dependency (=subsequent access at same column)
                # Record the difference between the access within the conflict, and the subsequent access
                current_pos = find_position(cgra.concurrent_accesses[i], column) + 1
                next_pos = find_position(cgra.concurrent_accesses[i + 1], column) if i < last_slot else 0
                latency[column] += current_pos - next_pos
    return latency

def find_position(conflict_pos, column):
    """Return the index, within its bank list, of the first access in ``column``.

    Args:
        conflict_pos: Dict mapping a bank index to a list of ``(row, col)``
            accesses; banks are scanned in dict order.
        column: Column to look for (compared with the second element of
            each access).

    Returns:
        int: Position of the first matching access inside its own list, or
        ``0`` when no access uses ``column`` (indistinguishable from a match
        at position 0).
    """
    for accesses in conflict_pos.values():
        for position, access in enumerate(accesses):
            if access[1] == column:
                return position
    return 0

def compute_latency_cc(cgra, dependencies):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe you want to give it a name that refers to memory? Because it's not latency in general, but the latency of a memory access, as I understand it.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

✔️

# Account for additional bus type specific delays
ACTIVE_ROW_COEF = bus_type_active_row_coef[cgra.memory_manager.bus_type]
CPU_LOOP_INSTRS = bus_type_cpu_loop_instrs[cgra.memory_manager.bus_type]
mem_count = [0] * cgra.N_COLS
latency_cc = max(dependencies)
for r in range(cgra.N_ROWS):
for c in range(cgra.N_COLS):
if cgra.cells[r][c].op in OPERATIONS_MEMORY_ACCESS:
mem_count[c] += 1
if ACTIVE_ROW_COEF:
for i in range (cgra.N_ROWS):
if mem_count[i] != 0:
latency_cc += ACTIVE_ROW_COEF
if CPU_LOOP_INSTRS:
cgra.flag_poll_cnt += latency_cc
if cgra.flag_poll_cnt % (CPU_LOOP_INSTRS - 1) == 0:
latency_cc += 1
return latency_cc

def display_characterization(cgra, pr):
    """Print the latency report sections selected by ``pr``.

    ``pr`` is only tested with ``in``.  ``"OP_MAX_LAT"`` prints one line per
    entry of ``cgra.instr_latency_cc``; ``"TOTAL_LAT"`` prints the
    configuration time, the interval before the first iteration (also stored
    on ``cgra.interval_latency``) and the total latency; ``"ALL_LAT_INFO"``
    enables both sections.
    """
    show_everything_first = "OP_MAX_LAT" in pr or "ALL_LAT_INFO" in pr
    if show_everything_first:
        print("\nLongest instructions per cycle:\n")
        header = "{:<8} {:<25} {:<10}".format("Cycle", "Instruction", "Latency (CC)")
        print(header)
        for cycle, entry in enumerate(cgra.instr_latency_cc, start=1):
            line = "{:<2} {:<6} {:<25} {:<10}".format(
                cycle, f'({entry.instr2exec})', entry.instr, entry.latency_cc)
            print(line)

    if "TOTAL_LAT" in pr or "ALL_LAT_INFO" in pr:
        instr_count = len(cgra.instrs)
        print(f'\nConfiguration time: {instr_count} CC')
        cgra.interval_latency = math.ceil(INTERVAL_CST + (instr_count * 3))
        print(f'Time between end of configuration and start of first iteration: {cgra.interval_latency} CC')
        print(f'Total time for all instructions: {cgra.total_latency_cc}')
32 changes: 1 addition & 31 deletions src/kernels.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,34 +22,4 @@ def kernel_new( name, dim=4 ):
csv.writer(f).writerow(["Address", "Data"])
csv.writer(f).writerow(["0", "0"])

print("Kernel", name, "created successfuly!")


def kernel_clear_memory(name, version=""):
    """Overwrite the kernel's memory file with just the CSV header row.

    The file path is ``./<name>/`` followed by ``FILENAME_MEM``, ``version``
    and ``EXT``.
    """
    import csv
    mem_path = "./" + name + "/" + FILENAME_MEM + version + EXT
    with open(mem_path, "w+", newline='') as mem_file:
        writer = csv.writer(mem_file)
        writer.writerow(["Address", "Data"])


def kernel_add_memory_region(name, start, vals, version=""):
    """Append a contiguous region of values to the kernel's memory file.

    Each value ``vals[i]`` is written as the row
    ``[start + i * WORD_SIZE, vals[i]]`` after the rows already in the file.
    The file path is ``./<name>/`` followed by ``FILENAME_MEM``, ``version``
    and ``EXT``.

    If the file cannot be read or written, a message is printed and nothing
    is raised.  Unlike a bare ``except``, unrelated errors and
    ``KeyboardInterrupt`` are no longer swallowed.
    """
    import csv
    mem_path = "./" + name + "/" + FILENAME_MEM + version + EXT
    region = [[start + i * WORD_SIZE, val] for i, val in enumerate(vals)]

    try:
        # newline='' on both handles so the csv module controls line endings.
        with open(mem_path, newline='') as f:
            mem = list(csv.reader(f))

        mem.extend(region)

        with open(mem_path, "w", newline='') as f:
            csv.writer(f).writerows(mem)
    except (OSError, UnicodeError, csv.Error):
        print("Could not open memory file")



print("Kernel", name, "created successfuly!")
Loading