add v1 experiment setup for random walker and ACO

AditiR-42 · Jul 17, 2023 · 919f0cd · 919f0cd
1 parent c5eabbd
commit 919f0cd
Show file tree

Hide file tree

Showing 4 changed files with 228 additions and 127 deletions.
diff --git a/aco/DeepSwarm/deepswarm/backends.py b/aco/DeepSwarm/deepswarm/backends.py
@@ -351,75 +351,69 @@ def __init__(self, path, exp_name, traject_dir, log_dir, reward_formulation, use
         self.reward_formulation = reward_formulation
         self.use_envlogger = use_envlogger
 
-        self.action_dict = {"system": {}, "network": {}}
-        self.action_dict["network"]["topology-name"] = "Hierarchical"
-        self.action_dict["network"]["topologies-per-dim"] = ["Ring", "Ring", "Ring"]
-        self.action_dict["network"]["dimension-type"] = ["N", "N", "N"]
-        # DIMENSION COUNT MUST BE SET TO 3 FOR NOW
-        self.action_dict["network"]["dimensions-count"] = 3  
-        self.action_dict["network"]["units-count"] = [4, 4, 4]
-        self.action_dict["network"]["link-latency"] = [1, 1, 1]
-        self.action_dict["network"]["link-bandwidth"] = [32, 16, 16]
-        self.action_dict["network"]["nic-latency"] = [0, 0, 0]
-        self.action_dict["network"]["router-latency"] = [0, 0, 0]
-        self.action_dict["network"]["hbm-latency"] = [500, 500, 500]
-        self.action_dict["network"]["hbm-bandwidth"] = [370, 370, 370]
-        self.action_dict["network"]["hbm-scale"] = [0, 0, 0]
-        self.action_dict["network"]["links-count"] = [2, 2, 2]
-
-        # system attributes
-        self.action_dict["system"]["scheduling-policy"] = "LIFO"
-        self.action_dict["system"]["endpoint-delay"] = 1
-        self.action_dict["system"]["active-chunks-per-dimension"] = 1
-        self.action_dict["system"]["preferred-dataset-splits"] = 4
-        self.action_dict["system"]["boost-mode"] = 0
-
-        self.action_dict["system"]["all-reduce-implementation"] = "ring_ring_ring"
-        self.action_dict["system"]["all-gather-implementation"] = "ring_ring_ring"
-        self.action_dict["system"]["reduce-scatter-implementation"] = "ring_ring_ring"
-        self.action_dict["system"]["all-to-all-implementation"] = "ring_ring_ring"
-        self.action_dict["system"]["collective-optimization"] = "baseline"
-
-        self.links_count = {"Ring": 2, "FullyConnected": 7, "Switch": 1}
+        self.settings_file_path = os.path.realpath(__file__)
+        self.settings_dir_path = os.path.dirname(self.settings_file_path)
+        self.proj_root_path = os.path.join(self.settings_dir_path, '..', '..', '..')
+
+        self.astrasim_archgym = os.path.join(self.proj_root_path, "sims/AstraSim/astrasim-archgym")
+        self.knobs_spec = os.path.join(self.astrasim_archgym, "dse/archgen_v1_knobs/archgen_v1_knobs_spec.py")
+
+        systems_folder = os.path.join(self.astrasim_archgym, "themis/inputs/system")
+        self.system_file = os.path.join(systems_folder, "3d_fc_ring_switch_baseline.txt")
+
+        # SET UP ACTION DICT
+        self.action_dict = {"network": {}, "workload": {}}
+        self.action_dict["network"]['path'] = "3d_fc_ring_switch.json"
+        self.action_dict["workload"]['path'] = "gnmt_fp16_fused.txt"
+
+        # PARSE SYSTEM FILE
+        self.parse_system(self.system_file, self.action_dict)
 
         for node in path:
-            if hasattr(node, "topology-name"):
-
-                self.action_dict["network"]["topology-name"] = node.topologyName
-                self.action_dict["network"]["topologies-per-dim"] = [node.topologiesPerDim1, node.topologiesPerDim2, node.topologiesPerDim3]
-                self.action_dict["network"]["dimension-type"] = [node.dimensionType, node.dimensionType, node.dimensionType]
-                # ADD CHECK HERE FOR DIMENSION COUNT
-                self.action_dict["network"]["dimensions-count"] = node.dimensionsCount
-                self.action_dict["network"]["units-count"] = [node.unitsCount1, node.unitsCount2, node.unitsCount3]
-
-                self.action_dict["network"]["link-latency"] = [node.linkLatency1, node.linkLatency2, node.linkLatency3]
-                self.action_dict["network"]["linkBandwidth1"] = [node.linkBandwidth1, node.linkBandwidth2, node.linkBandwidth3]
-                self.action_dict["network"]["nic-latency"] = [node.nicLatency1, node.nicLatency2, node.nicLatency3]
-                self.action_dict["network"]["router-latency"] = [node.routerLatency1, node.routerLatency2, node.routerLatency3]
-                self.action_dict["network"]["hbm-latency"] = [node.hbmLatency1, node.hbmLatency2, node.hbmLatency3]
-                self.action_dict["network"]["hbm-bandwidth"] = [node.hbmBandwidth1, node.hbmBandwidth2, node.hbmBandwidth3]
-                self.action_dict["network"]["hbm-scale"] = [node.hbmScale1, node.hbmScale2, node.hbmScale3]
-
-                # Need to map links-count to topology
-                self.action_dict["network"]["links-count"] = [self.links_count[self.action_dict["network"]["topologies-per-dim"][i]]
-                                for i in range(self.action_dict["network"]["dimensions-count"])]
-
-                # system attributes
-                self.action_dict["system"]["scheduling-policy"] = node.schedulingPolicy
-                self.action_dict["system"]["endpoint-delay"] = node.endpointDelay
-                self.action_dict["system"]["active-chunks-per-dimension"] = node.activeChunksPerDimension
-                self.action_dict["system"]["preferred-dataset-splits"] = node.preferredDatasetSplits
-                self.action_dict["system"]["boost-mode"] = node.boostMode
-
-                self.action_dict["system"]["all-reduce-implementation"] = f"{node.allReduceImplementation1}_{node.allReduceImplementation2}_{node.allReduceImplementation3}"
-                self.action_dict["system"]["all-gather-implementation"] = f"{node.allGatherImplementation1}_{node.allGatherImplementation2}_{node.allGatherImplementation3}"
-                self.action_dict["system"]["reduce-scatter-implementation"] = f"{node.reduceScatterImplementation1}_{node.reduceScatterImplementation2}_{node.reduceScatterImplementation3}"
-                self.action_dict["system"]["all-to-all-implementation"] = f"{node.allToAllImplementation1}_{node.allToAllImplementation2}_{node.allToAllImplementation3}"
-
-                self.action_dict["system"]["collective-optimization"] = node.collectiveOptimization
-
+            system_knob, network_knob = self.parse_knobs(self.knobs_spec)
+            dicts = [(system_knob, 'system'), (network_knob, 'network')]
+            if hasattr(node, "topologyName"):
+                nodes_dict = {
+                    "scheduling-policy": node.schedulingPolicy,
+                    "active-chunks-per-dimension": node.activeChunksPerDimension,
+                    "preferred-dataset-splits": node.preferredDatasetSplits,
+                    "collective-optimization": node.collectiveOptimization,
+                    "intra-dimension-scheduling": node.intraDimensionScheduling,
+                    "inter-dimension-scheduling": node.interDimensionScheduling,
+                }
+                for dict_type, dict_name in dicts:
+                    for knob in dict_type.keys():
+                        self.action_dict[dict_name][knob] = nodes_dict[knob]
 
-    # ADD ENVLOGGER STUFF (SEE BELOW)
+
+    def parse_knobs(self, knobs_spec):
+        SYSTEM_KNOBS = {}
+        NETWORK_KNOBS = {}
+
+        with open(knobs_spec, 'r') as file:
+            file_contents = file.read()
+            parsed_dicts = {}
+
+            # Evaluate the file contents and store the dictionaries in the parsed_dicts dictionary
+            exec(file_contents, parsed_dicts)
+
+            # Access the dictionaries
+            SYSTEM_KNOBS = parsed_dicts['SYSTEM_KNOBS']
+            NETWORK_KNOBS = parsed_dicts['NETWORK_KNOBS']
+
+        return SYSTEM_KNOBS, NETWORK_KNOBS  
+
+
+    def parse_system(self, system_file, action_dict):
+        # parse system_file (above is the content) into dict
+        action_dict['system'] = {}
+        with open(system_file, 'r') as file:
+            lines = file.readlines()
+
+            for line in lines:
+                key, value = line.strip().split(': ')
+                action_dict['system'][key] = value
+
 
     # Fit function = step function
     # Environment already calculates reward so don't need calc_reward

diff --git a/arch_gym/envs/AstraSimEnv.py b/arch_gym/envs/AstraSimEnv.py
@@ -14,11 +14,6 @@
 
 sim_path = os.path.join(proj_root_path, "sims", "AstraSim")
 
-# print("!!!!!!!!!!!!!!!!!!!!!!!!!")
-# print(settings_file_path)
-# print(settings_dir_path)
-# print(proj_root_path)
-
 # astra-sim environment
 class AstraSimEnv(gym.Env):
     def __init__(self, rl_form="random_walker", max_steps=5, num_agents=1, reward_formulation="None", reward_scaling=1):
@@ -47,15 +42,18 @@ def __init__(self, rl_form="random_walker", max_steps=5, num_agents=1, reward_fo
         self.info = {}
 
         self.exe_path = os.path.join(sim_path, "run_general.sh")
-        self.network_config = os.path.join(sim_path, "general_network.json")
+        # self.network_config = os.path.join(sim_path, "general_network.json")
         self.system_config = os.path.join(sim_path, "general_system.txt")
-        self.workload_config = os.path.join(sim_path, "general_workload.txt")
+
+        # V1 networks, systems, and workloads folder
+        self.networks_folder = os.path.join(sim_path, "astrasim-archgym/dse/archgen_v1_knobs/templates/network")
+        self.workloads_folder = os.path.join(sim_path, "astrasim-archgym/themis/inputs/workload")
+
+        self.network_config = os.path.join(self.networks_folder, "3d_fc_ring_switch.json")
+        self.workload_config = os.path.join(sim_path, "realworld_workloads/transformer_1t_fused_only_t.txt")
+
 
         print("_____________________*****************************_____________________")
-        print(self.exe_path)
-        print(self.network_config)
-        print(self.system_config)
-        print(self.workload_config)
 
         self.reset()
 
@@ -119,9 +117,17 @@ def calculate_reward(self, observations):
     def step(self, action_dict):
 
         # write the three config files
-        with open(self.network_config, "w") as outfile:
-            outfile.write(json.dumps(action_dict['network'], indent=4))
+        # with open(self.network_config, "w") as outfile:
+        #     outfile.write(json.dumps(action_dict['network'], indent=4))
+        if "path" in action_dict["network"]:
+            self.network_config = action_dict["network"]["path"]
+
+        if "path" in action_dict["workload"]:
+            self.workload_config = action_dict["workload"]["path"]
 
+        # load knobs
+        print("system_config")
+        print(action_dict["system"])
         with open(self.system_config, 'w') as file:
             for key, value in action_dict["system"].items():
                 file.write(f'{key}: {value}\n')
@@ -136,8 +142,13 @@ def step(self, action_dict):
             self.counter += 1
 
         # start subrpocess to run the simulation
-        process = subprocess.Popen([self.exe_path, self.network_config, self.system_config, self.workload_config],
-                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        # $1: network, $2: system, $3: workload
+        print("Running simulation...")
+        process = subprocess.Popen([self.exe_path, 
+                                    self.network_config, 
+                                    self.system_config, 
+                                    self.workload_config],
+                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
         # get the output
         out, err = process.communicate()

diff --git a/sims/AstraSim/run_general.sh b/sims/AstraSim/run_general.sh
@@ -4,10 +4,16 @@
 SCRIPT_DIR=$(dirname "$(realpath $0)")
 
 # Absolute paths to useful directories
-BINARY="${SCRIPT_DIR:?}"/astra-sim/build/astra_analytical/build/AnalyticalAstra/bin/AnalyticalAstra
-NETWORK="${SCRIPT_DIR:?}"/general_network.json
+BINARY="${SCRIPT_DIR:?}"/astrasim-archgym/astra-sim/build/astra_analytical/build/AnalyticalAstra/bin/AnalyticalAstra
 SYSTEM="${SCRIPT_DIR:?}"/general_system.txt
-WORKLOAD="${SCRIPT_DIR:?}"/astra-sim/inputs/workload/Transformer_HybridParallel.txt
+NETWORK="${SCRIPT_DIR:?}"/astrasim-archgym/themis/inputs/network/analytical/$1
+WORKLOAD="${SCRIPT_DIR:?}"/astrasim-archgym/themis/inputs/workload/realworld_workloads/$3
+
+echo "SH NETWORK: ${NETWORK}"
+echo "SH SYSTEM: ${SYSTEM}"
+echo "SH WORKLOAD: ${WORKLOAD}"
+
+# WORKLOAD="${SCRIPT_DIR:?}"/astra-sim/inputs/workload/Transformer_HybridParallel.txt # CHANGE THIS
 STATS="${SCRIPT_DIR:?}"/results/run_general
 
 rm -rf "${STATS}"