Skip to content

Commit

Permalink
Merge pull request #62 from bibrakc/60-cpi-in-actions
Browse files Browse the repository at this point in the history
60 cpi in actions
  • Loading branch information
bibrakc authored May 2, 2024
2 parents cce5013 + e15c84e commit 479b4de
Show file tree
Hide file tree
Showing 136 changed files with 8,087 additions and 1,749 deletions.
297 changes: 297 additions & 0 deletions Analytics/Area_Energy_Cost_Model/cca_chip_perf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
#!/usr/bin/env python3

# Copyright (c) 2024, Maciej Andrzej Brodowicz

import math, sys

# All physical quantities expressed in unprefixed SI units

#############################################################################
### inputs
#############################################################################

# Here operon means message.

args = sys.argv

if len(sys.argv) < 3:
print(
"Usage: python cca_chip_perf.py <dim (for a square chip dim^2)> <memory per cc in KB> \
<cycles> <total operon movements> <total actions or total propagates, same thing> \
<bytes payload per operon> <actions executed> <bytes read per action> <network type>"
)
sys.exit(1)

# execution cycles
cycles = 10944 # 1e5
# total operon nearest-neighbor hops
op_hops = 52311459 # 75e6
# total operons
op_count = 1252526 # 1e6
# average operon payload
payload_bytes = 4
# actions executed
action_count = 110119 # 2e6
# bytes per action
action_bytes = 4
# Ruche distance (in nearest neighbor hops)
ruche_len = 3
# Ruche fraction of all operon traffic
ruche_op_hops = 60e6

# fontons per die
fn = 64
# SRAM per fonton
sram_bytes = 2048 * 1024 # 1*1024*1024
# clock [Hz]
freq = 1e9
# flit (operon) size
flit_bits = 256
# word size [bits]
word = 64


# From CMD
fn = int(args[1])
sram_bytes = int(args[2]) * 1024 # Assumes in KB therefore we multiply by 1024
cycles = int(args[3])
op_hops = int(args[4])
op_count = int(args[5])
payload_bytes = int(args[6])
action_count = int(args[7])
action_bytes = int(args[8])
network = args[9]


#############################################################################
### data from Dalorex paper
#############################################################################
# SRAM density (7nm) [byte/m^2]
sram_byte_density = 29.2e6 / 8 / 1e-6

# leakage power (32KB @7nm) [W/byte]
sram_byte_leakage = 16.9e-6 / 32768

# SRAM access cost per access [J/word]
sram_rd_energy = 5.8e-12
sram_wr_energy = 9.12e-12

# NoC: energy to send message bit over unit distance [J/m]
msg_bit_energy = 8e-12 / 32 / 1e-3

#############################################################################
### RV
#############################################################################
# SiFive: RV32E in 13,500 gates
# ZERO_RISCY: 11,600 gates 2-stage pipeline
# RISC V transistor count
rv_T = 12000 * 4

# RISC V area (vanilla core)
rv_area = 0.024e-6

#############################################################################
### transistor counts
#############################################################################
# logic density (TSMC original N7)
Tdensity = 91e12

# basic component transistor counts: flip-flop, n:1 muxes
fdreT = 12
mux2T = 8
mux3T = 2 * mux2T
mux4T = 3 * mux2T
mux6T = 5 * mux2T
mux5T = 4 * mux2T
mux7T = 6 * mux2T
mux8T = 2 * mux4T + mux2T

# from Jung (Ruche nets) paper
# plain Cartesian mesh
# fifos:
ofifoT = 5 * 2 * fdreT * flit_bits
ififoT = 1 * 4 * fdreT * flit_bits
# muxes
fifomuxT = (2 * mux2T + 2 * mux4T + mux5T) * flit_bits
meshT = fifomuxT + ofifoT + ififoT

# 2D torus supposedly 50% more expensive
t2dT = meshT * 1.5

# Ruche
ruche_ififoT = ififoT + 4 * fdreT * flit_bits
ruche_ofifoT = ofifoT
ruche_fifomuxT = (2 * mux3T + 2 * mux6T + mux7T + 2 * mux2T) * flit_bits
rucheT = ruche_fifomuxT + ruche_ofifoT + ruche_ififoT

# Ruche and 2D torus
ruchet2dT = rucheT - meshT + t2dT

# execution unit
euT = 100e3


#############################################################################
def banner(s):
LEN = 50
l = (LEN - len(s)) // 2
if l <= 0:
print("***", s)
return
print(l * "*", s, l * "*")


# fonton area
def farea(netT):
memA = sram_bytes / sram_byte_density
netA = netT / Tdensity
euA = euT / Tdensity
return memA + netA + euA


# cumulative operon transport energy
def net_energy(hops, avg_dist):
return hops * avg_dist * flit_bits * msg_bit_energy


# dynamic memory energy
def mem_energy(word_reads, word_writes):
rdE = word_reads * sram_rd_energy
wrE = word_writes * sram_wr_energy
return rdE + wrE


# static energy
def leakage_power():
return fn * fn * sram_bytes * sram_byte_leakage


# mesh NoC
def mesh_cost():
banner("2D Mesh")
fa = farea(meshT)
side = math.sqrt(fa)

""" print(f"Chip area: {fn*fn*fa*1e6:.6f} mm^2")
print(f"Total CCs: {fn*fn}")
print(f"Total Chip Memory: {(fn*fn*sram_bytes)/(1024*1024)} MB") """

netE = net_energy(op_hops, side)
memE = mem_energy(
op_count * payload_bytes / (word / 8), action_count * action_bytes / (word / 8)
)
time = cycles / freq
totE = netE + memE + leakage_power() * time
""" print(f"Total energy: {totE:.6f} J")
print(f"Average power: {totE/time:.6f} W")
print(f"Total time: {time:.6f} s") """

chip_area = fn * fn * fa * 1e6

return chip_area, totE, time


# 2D torus NoC
def t2d_cost():
banner("2D torus NoC")
fa = farea(t2dT)
side = math.sqrt(fa)
op_dist = 2 * (fn - 1) * side / fn

""" print(f"Chip area: {fn*fn*fa*1e6:.6f} mm^2")
print(f"Total CCs: {fn*fn}")
print(f"Total Chip Memory: {(fn*fn*sram_bytes)/(1024*1024)} MB")
"""
netE = net_energy(op_hops, op_dist)
memE = mem_energy(
op_count * payload_bytes / (word / 8), action_count * action_bytes / (word / 8)
)
time = cycles / freq
totE = netE + memE + leakage_power() * time
""" print(f"Total energy: {totE:.6f} J")
print(f"Average power: {totE/time:.6f} W")
print(f"Total time: {time:.6f} s") """

chip_area = fn * fn * fa * 1e6

return chip_area, totE, time


# Ruche NoC
def ruche_cost():
banner("Ruche NoC")
fa = farea(rucheT)
side = math.sqrt(fa)

print(f"Chip area: {fn*fn*fa*1e6:.6f} mm^2")
print(f"Total CCs: {fn*fn}")
print(f"Total Chip Memory: {(fn*fn*sram_bytes)/(1024*1024)} MB")

if ruche_op_hops > op_hops:
print("Error: count of operon hops over Ruche links greater than total hops")
sys.exit(1)
meshE = net_energy(op_hops - ruche_op_hops, side)
rucheE = net_energy(ruche_op_hops, side * ruche_len)
netE = meshE + rucheE
memE = mem_energy(
op_count * payload_bytes / (word / 8), action_count * action_bytes / (word / 8)
)
time = cycles / freq
totE = netE + memE + leakage_power() * time
print(f"Total energy: {totE:.6f} J")
print(f"Average power: {totE/time:.6f} W")
print(f"Total time: {time:.6f} s")

chip_area = fn * fn * fa * 1e6

return chip_area, totE, time


# 2D torus + Ruche NoC
def ruchet2d_cost():
banner("2D torus and Ruche NoC")
fa = farea(ruchet2dT)
side = math.sqrt(fa)

print(f"Chip area: {fn*fn*fa*1e6:.6f} mm^2")
print(f"Total CCs: {fn*fn}")
print(f"Total Chip Memory: {(fn*fn*sram_bytes)/(1024*1024)} MB")

if ruche_op_hops > op_hops:
print("Error: count of operon hops over Ruche links greater than total hops")
sys.exit(1)
t2d_dist = 2 * (fn - 1) * side / fn
t2dE = net_energy(op_hops - ruche_op_hops, t2d_dist)
rucheE = net_energy(ruche_op_hops, side * ruche_len)
netE = t2dE + rucheE
memE = mem_energy(
op_count * payload_bytes / (word / 8), action_count * action_bytes / (word / 8)
)
time = cycles / freq
totE = netE + memE + leakage_power() * time
print(f"Total energy: {totE:.6f} J")
print(f"Average power: {totE/time:.6f} W")
print(f"Total time: {time:.6f} s")

chip_area = fn * fn * fa * 1e6

return chip_area, totE, time


#############################################################################
if __name__ == "__main__":
if network == "MESH":
chip_area, totE, time = mesh_cost()
# print(chip_area, totE, time
print(f"Chip area: {chip_area:.6f} mm^2")
print(f"Total energy: {totE:.6f} J")
print(f"Total time: {time:.6f} s")
elif network == "TORUS":
chip_area, totE, time = t2d_cost()
print(f"Chip area: {chip_area:.6f} mm^2")
print(f"Total energy: {totE:.6f} J")
print(f"Total time: {time:.6f} s")
else:
print("Invalid Network! ERROR")
# ruche_cost()
# ruchet2d_cost()
21 changes: 12 additions & 9 deletions Analytics/Post_Processing/post_processing_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@

# read the per cycle active status data
active_status_per_cycle = [] # stores the active status
for i in range(0, cycles): # cycles all cycles
for i in range(0, 3): # cycles all cycles
line = file.readline()
line = line.strip().split("\t")
cycle = int(line[0])
Expand Down Expand Up @@ -264,12 +264,12 @@ def thousands_formatter(x, pos):
def congestion_charts():

# Create a figure with subplots using gridspec_kw for shared boundaries
fig, axes = plt.subplots(1, 4, figsize=(9, 2.5), sharey=True)
fig, axes = plt.subplots(1, 4, figsize=(14, 4), sharey=True)

# Flatten the axes array to easily iterate over subplots
axes = axes.flatten()

bins = 15
bins = 25

# blue: '#2F5597'
# red: '#B00002'
Expand Down Expand Up @@ -303,22 +303,25 @@ def congestion_charts():
)
axes[3].set_xlabel("") # Remove x-axis label for the second subplot

font_size = 16
font_size_axis = 18

# Set titles for each subplot
titles = ["West Channel", "North Channel", "East Channel", "South Channel"]
axes[0].set_ylabel("Count of Compute Cells", fontsize=12, fontweight="bold")
axes[0].tick_params(axis="y", labelsize=12)
axes[0].set_ylabel("Count of Compute Cells", fontsize=font_size, fontweight="bold")
axes[0].tick_params(axis="y", labelsize=font_size_axis)
for ax, title in zip(axes, titles):
ax.set_title(title, fontsize=12, fontweight="bold")
ax.tick_params(axis="x", labelsize=12)
ax.set_title(title, fontsize=font_size_axis, fontweight="bold")
ax.tick_params(axis="x", labelsize=font_size_axis, rotation=90)
# ax.set_xlabel('Cycles Spent in Contention', fontsize=11, fontweight='bold')
ax.grid(axis="y", color="gray", linestyle="dashed", linewidth=1)

# Format x-axis ticks using the thousands_formatter function
for ax in axes:
ax.xaxis.set_major_formatter(FuncFormatter(thousands_formatter))
ax.yaxis.set_major_formatter(FuncFormatter(thousands_formatter))
# ax.set_xlim(0, 400000) # Set the x-axis limit
ax.set_ylim(0, 2500) # Set the y-axis limit
ax.set_xlim(0, 600000) # Set the x-axis limit
ax.set_ylim(0, 1500) # Set the y-axis limit

# Adjust spacing between subplots
plt.tight_layout()
Expand Down
5 changes: 5 additions & 0 deletions Applications/Breadth_First_Search/cca_bfs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ bfs_predicate_T(ComputeCell& cc,
// actionType /* action_type_in */,
const ActionArgumentType args) -> Closure
{
cc.apply_CPI(1);

// First check whether this is a ghost vertex.If it is then always predicate true.
// parent word is used in the sense that `RecursiveParallelVertex` is the parent class.
auto* parent_recursive_parralel_vertex = static_cast<ghost_type*>(cc.get_object(addr));
Expand Down Expand Up @@ -126,6 +128,8 @@ bfs_work_T(ComputeCell& cc,
// actionType /* action_type_in */,
const ActionArgumentType args) -> Closure
{
cc.apply_CPI(1);

// First check whether this is a ghost vertex. If it is then don't perform any work.
// parent word is used in the sense that `RecursiveParallelVertex` is the parent class.
auto* parent_recursive_parralel_vertex = static_cast<ghost_type*>(cc.get_object(addr));
Expand Down Expand Up @@ -160,6 +164,7 @@ bfs_diffuse_predicate_T(ComputeCell& cc,
// actionType /* action_type_in */,
const ActionArgumentType args) -> Closure
{
cc.apply_CPI(1);

// First check whether this is a ghost vertex. If it is then always predicate true.
// parent word is used in the sense that `RecursiveParallelVertex` is the parent class.
Expand Down
Loading

0 comments on commit 479b4de

Please sign in to comment.