diff --git a/aco/DeepSwarm/deepswarm/backends.py b/aco/DeepSwarm/deepswarm/backends.py index 9769c21c..4d541a5b 100644 --- a/aco/DeepSwarm/deepswarm/backends.py +++ b/aco/DeepSwarm/deepswarm/backends.py @@ -351,75 +351,69 @@ def __init__(self, path, exp_name, traject_dir, log_dir, reward_formulation, use self.reward_formulation = reward_formulation self.use_envlogger = use_envlogger - self.action_dict = {"system": {}, "network": {}} - self.action_dict["network"]["topology-name"] = "Hierarchical" - self.action_dict["network"]["topologies-per-dim"] = ["Ring", "Ring", "Ring"] - self.action_dict["network"]["dimension-type"] = ["N", "N", "N"] - # DIMENSION COUNT MUST BE SET TO 3 FOR NOW - self.action_dict["network"]["dimensions-count"] = 3 - self.action_dict["network"]["units-count"] = [4, 4, 4] - self.action_dict["network"]["link-latency"] = [1, 1, 1] - self.action_dict["network"]["link-bandwidth"] = [32, 16, 16] - self.action_dict["network"]["nic-latency"] = [0, 0, 0] - self.action_dict["network"]["router-latency"] = [0, 0, 0] - self.action_dict["network"]["hbm-latency"] = [500, 500, 500] - self.action_dict["network"]["hbm-bandwidth"] = [370, 370, 370] - self.action_dict["network"]["hbm-scale"] = [0, 0, 0] - self.action_dict["network"]["links-count"] = [2, 2, 2] - - # system attributes - self.action_dict["system"]["scheduling-policy"] = "LIFO" - self.action_dict["system"]["endpoint-delay"] = 1 - self.action_dict["system"]["active-chunks-per-dimension"] = 1 - self.action_dict["system"]["preferred-dataset-splits"] = 4 - self.action_dict["system"]["boost-mode"] = 0 - - self.action_dict["system"]["all-reduce-implementation"] = "ring_ring_ring" - self.action_dict["system"]["all-gather-implementation"] = "ring_ring_ring" - self.action_dict["system"]["reduce-scatter-implementation"] = "ring_ring_ring" - self.action_dict["system"]["all-to-all-implementation"] = "ring_ring_ring" - self.action_dict["system"]["collective-optimization"] = "baseline" - - self.links_count = {"Ring": 2, "FullyConnected": 7, "Switch": 1} + self.settings_file_path = os.path.realpath(__file__) + self.settings_dir_path = os.path.dirname(self.settings_file_path) + self.proj_root_path = os.path.join(self.settings_dir_path, '..', '..', '..') + + self.astrasim_archgym = os.path.join(self.proj_root_path, "sims/AstraSim/astrasim-archgym") + self.knobs_spec = os.path.join(self.astrasim_archgym, "dse/archgen_v1_knobs/archgen_v1_knobs_spec.py") + + systems_folder = os.path.join(self.astrasim_archgym, "themis/inputs/system") + self.system_file = os.path.join(systems_folder, "3d_fc_ring_switch_baseline.txt") + + # SET UP ACTION DICT + self.action_dict = {"network": {}, "workload": {}} + self.action_dict["network"]['path'] = "3d_fc_ring_switch.json" + self.action_dict["workload"]['path'] = "gnmt_fp16_fused.txt" + + # PARSE SYSTEM FILE + self.parse_system(self.system_file, self.action_dict) for node in path: - if hasattr(node, "topology-name"): - - self.action_dict["network"]["topology-name"] = node.topologyName - self.action_dict["network"]["topologies-per-dim"] = [node.topologiesPerDim1, node.topologiesPerDim2, node.topologiesPerDim3] - self.action_dict["network"]["dimension-type"] = [node.dimensionType, node.dimensionType, node.dimensionType] - # ADD CHECK HERE FOR DIMENSION COUNT - self.action_dict["network"]["dimensions-count"] = node.dimensionsCount - self.action_dict["network"]["units-count"] = [node.unitsCount1, node.unitsCount2, node.unitsCount3] - - self.action_dict["network"]["link-latency"] = [node.linkLatency1, node.linkLatency2, node.linkLatency3] - self.action_dict["network"]["linkBandwidth1"] = [node.linkBandwidth1, node.linkBandwidth2, node.linkBandwidth3] - self.action_dict["network"]["nic-latency"] = [node.nicLatency1, node.nicLatency2, node.nicLatency3] - self.action_dict["network"]["router-latency"] = [node.routerLatency1, node.routerLatency2, node.routerLatency3] - self.action_dict["network"]["hbm-latency"] = [node.hbmLatency1, node.hbmLatency2, node.hbmLatency3] - self.action_dict["network"]["hbm-bandwidth"] = [node.hbmBandwidth1, node.hbmBandwidth2, node.hbmBandwidth3] - self.action_dict["network"]["hbm-scale"] = [node.hbmScale1, node.hbmScale2, node.hbmScale3] - - # Need to map links-count to topology - self.action_dict["network"]["links-count"] = [self.links_count[self.action_dict["network"]["topologies-per-dim"][i]] - for i in range(self.action_dict["network"]["dimensions-count"])] - - # system attributes - self.action_dict["system"]["scheduling-policy"] = node.schedulingPolicy - self.action_dict["system"]["endpoint-delay"] = node.endpointDelay - self.action_dict["system"]["active-chunks-per-dimension"] = node.activeChunksPerDimension - self.action_dict["system"]["preferred-dataset-splits"] = node.preferredDatasetSplits - self.action_dict["system"]["boost-mode"] = node.boostMode - - self.action_dict["system"]["all-reduce-implementation"] = f"{node.allReduceImplementation1}_{node.allReduceImplementation2}_{node.allReduceImplementation3}" - self.action_dict["system"]["all-gather-implementation"] = f"{node.allGatherImplementation1}_{node.allGatherImplementation2}_{node.allGatherImplementation3}" - self.action_dict["system"]["reduce-scatter-implementation"] = f"{node.reduceScatterImplementation1}_{node.reduceScatterImplementation2}_{node.reduceScatterImplementation3}" - self.action_dict["system"]["all-to-all-implementation"] = f"{node.allToAllImplementation1}_{node.allToAllImplementation2}_{node.allToAllImplementation3}" - - self.action_dict["system"]["collective-optimization"] = node.collectiveOptimization - + system_knob, network_knob = self.parse_knobs(self.knobs_spec) + dicts = [(system_knob, 'system'), (network_knob, 'network')] + if hasattr(node, "topologyName"): + nodes_dict = { + "scheduling-policy": node.schedulingPolicy, + "active-chunks-per-dimension": node.activeChunksPerDimension, + "preferred-dataset-splits": node.preferredDatasetSplits, + "collective-optimization": node.collectiveOptimization, + "intra-dimension-scheduling": node.intraDimensionScheduling, + "inter-dimension-scheduling": node.interDimensionScheduling, + } + for dict_type, dict_name in dicts: + for knob in dict_type.keys(): + self.action_dict[dict_name][knob] = nodes_dict[knob] - # ADD ENVLOGGER STUFF (SEE BELOW) + + def parse_knobs(self, knobs_spec): + SYSTEM_KNOBS = {} + NETWORK_KNOBS = {} + + with open(knobs_spec, 'r') as file: + file_contents = file.read() + parsed_dicts = {} + + # Evaluate the file contents and store the dictionaries in the parsed_dicts dictionary + exec(file_contents, parsed_dicts) + + # Access the dictionaries + SYSTEM_KNOBS = parsed_dicts['SYSTEM_KNOBS'] + NETWORK_KNOBS = parsed_dicts['NETWORK_KNOBS'] + + return SYSTEM_KNOBS, NETWORK_KNOBS + + + def parse_system(self, system_file, action_dict): + # parse system_file (above is the content) into dict + action_dict['system'] = {} + with open(system_file, 'r') as file: + lines = file.readlines() + + for line in lines: + key, value = line.strip().split(': ') + action_dict['system'][key] = value + # Fit function = step function # Environment already calculates reward so don't need calc_reward diff --git a/arch_gym/envs/AstraSimEnv.py b/arch_gym/envs/AstraSimEnv.py index 0a3dafd9..f561e495 100644 --- a/arch_gym/envs/AstraSimEnv.py +++ b/arch_gym/envs/AstraSimEnv.py @@ -14,11 +14,6 @@ sim_path = os.path.join(proj_root_path, "sims", "AstraSim") -# print("!!!!!!!!!!!!!!!!!!!!!!!!!") -# print(settings_file_path) -# print(settings_dir_path) -# print(proj_root_path) - # astra-sim environment class AstraSimEnv(gym.Env): def __init__(self, rl_form="random_walker", max_steps=5, num_agents=1, reward_formulation="None", reward_scaling=1): @@ -47,15 +42,18 @@ def __init__(self, rl_form="random_walker", max_steps=5, num_agents=1, reward_fo self.info = {} self.exe_path = os.path.join(sim_path, "run_general.sh") - self.network_config = os.path.join(sim_path, "general_network.json") + # self.network_config = os.path.join(sim_path, "general_network.json") self.system_config = os.path.join(sim_path, "general_system.txt") - self.workload_config = os.path.join(sim_path, "general_workload.txt") + + # V1 networks, systems, and workloads folder + self.networks_folder = os.path.join(sim_path, "astrasim-archgym/dse/archgen_v1_knobs/templates/network") + self.workloads_folder = os.path.join(sim_path, "astrasim-archgym/themis/inputs/workload") + + self.network_config = os.path.join(self.networks_folder, "3d_fc_ring_switch.json") + self.workload_config = os.path.join(sim_path, "realworld_workloads/transformer_1t_fused_only_t.txt") + print("_____________________*****************************_____________________") - print(self.exe_path) - print(self.network_config) - print(self.system_config) - print(self.workload_config) self.reset() @@ -119,9 +117,17 @@ def calculate_reward(self, observations): def step(self, action_dict): # write the three config files - with open(self.network_config, "w") as outfile: - outfile.write(json.dumps(action_dict['network'], indent=4)) + # with open(self.network_config, "w") as outfile: + # outfile.write(json.dumps(action_dict['network'], indent=4)) + if "path" in action_dict["network"]: + self.network_config = action_dict["network"]["path"] + + if "path" in action_dict["workload"]: + self.workload_config = action_dict["workload"]["path"] + # load knobs + print("system_config") + print(action_dict["system"]) with open(self.system_config, 'w') as file: for key, value in action_dict["system"].items(): file.write(f'{key}: {value}\n') @@ -136,8 +142,13 @@ def step(self, action_dict): self.counter += 1 # start subrpocess to run the simulation - process = subprocess.Popen([self.exe_path, self.network_config, self.system_config, self.workload_config], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) + # $1: network, $2: system, $3: workload + print("Running simulation...") + process = subprocess.Popen([self.exe_path, + self.network_config, + self.system_config, + self.workload_config], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) # get the output out, err = process.communicate() diff --git a/sims/AstraSim/run_general.sh b/sims/AstraSim/run_general.sh index 9f1860dc..9c9aa942 100755 --- a/sims/AstraSim/run_general.sh +++ b/sims/AstraSim/run_general.sh @@ -4,10 +4,16 @@ SCRIPT_DIR=$(dirname "$(realpath $0)") # Absolute paths to useful directories -BINARY="${SCRIPT_DIR:?}"/astra-sim/build/astra_analytical/build/AnalyticalAstra/bin/AnalyticalAstra -NETWORK="${SCRIPT_DIR:?}"/general_network.json +BINARY="${SCRIPT_DIR:?}"/astrasim-archgym/astra-sim/build/astra_analytical/build/AnalyticalAstra/bin/AnalyticalAstra SYSTEM="${SCRIPT_DIR:?}"/general_system.txt -WORKLOAD="${SCRIPT_DIR:?}"/astra-sim/inputs/workload/Transformer_HybridParallel.txt +NETWORK="${SCRIPT_DIR:?}"/astrasim-archgym/themis/inputs/network/analytical/$1 +WORKLOAD="${SCRIPT_DIR:?}"/astrasim-archgym/themis/inputs/workload/realworld_workloads/$3 + +echo "SH NETWORK: ${NETWORK}" +echo "SH SYSTEM: ${SYSTEM}" +echo "SH WORKLOAD: ${WORKLOAD}" + +# WORKLOAD="${SCRIPT_DIR:?}"/astra-sim/inputs/workload/Transformer_HybridParallel.txt # CHANGE THIS STATS="${SCRIPT_DIR:?}"/results/run_general rm -rf "${STATS}" diff --git a/sims/AstraSim/trainRandomWalkerAstraSim.py b/sims/AstraSim/trainRandomWalkerAstraSim.py index 8f12ace8..0f7a31c1 100644 --- a/sims/AstraSim/trainRandomWalkerAstraSim.py +++ b/sims/AstraSim/trainRandomWalkerAstraSim.py @@ -1,5 +1,6 @@ import os import sys +import pickle from absl import flags from absl import app @@ -34,9 +35,6 @@ def write_network(dimension): - def dim_helper(dim, val): - return [val for _ in range(dim)] - def rand_dim_helper(dim, vals): return [random.choice(vals) for _ in range(dim)] @@ -52,22 +50,22 @@ def rand_float_helper(dim, min, max): "topology-name": random.choice(["Hierarchical"]), "topologies-per-dim": rand_dim_helper(dimension, ["Ring", "FullyConnected", "Switch"]), # NEED TO CHECK HOW RANDOM DIM TYPE CAN BE - "dimension-type": dim_helper(dimension, random.choice(["N", "P"])), + # "dimension-type": rand_dim_helper(dimension, ["T", "N", "P"]), + "dimension-type": rand_dim_helper(dimension, ["N"]), + # "dimensions-count": (int, 1, 5, 2), "dimensions-count": dimension, - "units-count": rand_num_helper(dimension, 2, 8), - "link-latency": rand_num_helper(dimension, 1, 500), - "link-bandwidth": rand_float_helper(dimension, 12.0, 250.0), + "units-count": rand_num_helper(dimension, 2, 1024), + "links-count": rand_num_helper(dimension, 1, 10), + "link-latency": rand_num_helper(dimension, 1, 1000), + "link-bandwidth": rand_float_helper(dimension, 0.00001, 100000), # SHOULD THIS BE ONLY ZEROS? - "nic-latency": rand_num_helper(dimension, 0, 0), - "router-latency": rand_num_helper(dimension, 0, 10), - "hbm-latency": rand_num_helper(dimension, 1, 500), - "hbm-bandwidth": rand_num_helper(dimension, 1, 500), - "hbm-scale": rand_num_helper(dimension, 0, 1), + "nic-latency": rand_num_helper(dimension, 0, 1000), + "router-latency": rand_num_helper(dimension, 0, 1000), + "hbm-latency": rand_num_helper(dimension, 1, 1), + "hbm-bandwidth": rand_num_helper(dimension, 1, 1), + "hbm-scale": rand_num_helper(dimension, 0, 0), } - network["links-count"] = [links_count[network["topologies-per-dim"][i]] - for i in range(dimension)] - return network @@ -83,16 +81,19 @@ def implementation_helper(dim, val): system = { "scheduling-policy": random.choice(["LIFO", "FIFO"]), - "endpoint-delay": random.randint(1, 10), - "active-chunks-per-dimension": 1, + "endpoint-delay": random.randint(1, 1000), + "active-chunks-per-dimension": random.randint(1, 32), # whenever dataset splits is high, it takes a long time to run - "preferred-dataset-splits": random.randint(1, 32), - "boost-mode": random.randint(0, 1), - "all-reduce-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect"])), - "all-gather-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect"])), - "reduce-scatter-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect"])), - "all-to-all-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect"])), - "collective-optimization": random.choice(["baseline", "localBWAware"]) + "preferred-dataset-splits": random.randint(16, 1024), + "boost-mode": 1, + "all-reduce-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect", "hierarchicalRing", "halvingDoubling", "oneHalvingDoubling"])), + "all-gather-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect", "hierarchicalRing", "halvingDoubling", "oneHalvingDoubling"])), + "reduce-scatter-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect", "hierarchicalRing", "halvingDoubling", "oneHalvingDoubling"])), + "all-to-all-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect", "hierarchicalRing", "halvingDoubling", "oneHalvingDoubling"])), + + "collective-optimization": random.choice(["baseline", "localBWAware"]), + "intra-dimension-scheduling": random.choice(["FIFO", "SCF"]), + "inter-dimension-scheduling": random.choice(["baseline", "themis"]) } return system @@ -163,13 +164,79 @@ def write_workload(): return {"value": value} -def generate_random_actions(dimension): - action = {} - action['network'] = write_network(dimension) - action['system'] = write_system(dimension) - # action['workload'] = write_workload() +# parses the network file +# def parse_network(network_file): +# with open(network_file) as f: +# network = json.load(f) +# return network + + +# systems: parse from file into json into generate_random_actions +""" +system_file content: +scheduling-policy: LIFO +endpoint-delay: 1 +active-chunks-per-dimension: 1 +preferred-dataset-splits: 64 +boost-mode: 1 +all-reduce-implementation: direct_ring_halvingDoubling +all-gather-implementation: direct_ring_halvingDoubling +reduce-scatter-implementation: direct_ring_halvingDoubling +all-to-all-implementation: direct_direct_direct +collective-optimization: localBWAware +intra-dimension-scheduling: FIFO +inter-dimension-scheduling: baseline +""" +def parse_system(system_file, action_dict): + # parse system_file (above is the content) into dict + action_dict['system'] = {} + with open(system_file, 'r') as file: + lines = file.readlines() + + for line in lines: + key, value = line.strip().split(': ') + action_dict['system'][key] = value + + + + +# def parse_workload(workload_file): + - return action + +# parses knobs that we want to experiment with +def parse_knobs(knobs_spec): + SYSTEM_KNOBS = {} + NETWORK_KNOBS = {} + + with open(knobs_spec, 'r') as file: + file_contents = file.read() + parsed_dicts = {} + + # Evaluate the file contents and store the dictionaries in the parsed_dicts dictionary + exec(file_contents, parsed_dicts) + + # Access the dictionaries + SYSTEM_KNOBS = parsed_dicts['SYSTEM_KNOBS'] + NETWORK_KNOBS = parsed_dicts['NETWORK_KNOBS'] + + return SYSTEM_KNOBS, NETWORK_KNOBS + + + +# action_type = specify 'network' or 'system +# new_params = parsed knobs from experiment file +def generate_random_actions(action_dict, system_knob, network_knob): + dicts = [(system_knob, 'system'), (network_knob, 'network')] + for dict_type, dict_name in dicts: + for knob in dict_type.keys(): + if isinstance(dict_type[knob], set): + action_dict[dict_name][knob] = random.choice(list(dict_type[knob])) + else: + action_dict[dict_name][knob] = random.randint(dict_type[knob][1], dict_type[knob][2]) + + return action_dict + def log_results_to_csv(filename, fitness_dict): df = pd.DataFrame([fitness_dict['reward']]) @@ -209,11 +276,27 @@ def main(_): settings_dir_path = os.path.dirname(settings_file_path) proj_root_path = os.path.abspath(settings_dir_path) + astrasim_archgym = os.path.join(proj_root_path, "astrasim-archgym") + + # TODO: V1 SPEC: + archgen_v1_knobs = os.path.join(astrasim_archgym, "dse/archgen_v1_knobs") + knobs_spec = os.path.join(archgen_v1_knobs, "archgen_v1_knobs_spec.py") + networks_folder = os.path.join(archgen_v1_knobs, "templates/network") + systems_folder = os.path.join(astrasim_archgym, "themis/inputs/system") + workloads_folder = os.path.join(astrasim_archgym, "themis/inputs/workload") + + # DEFINE NETWORK AND SYSTEM AND WORKLOAD + network_file = "3d_fc_ring_switch.json" + system_file = os.path.join(systems_folder, "3d_fc_ring_switch_baseline.txt") + workload_file = "gnmt_fp16_fused.txt" + + exe_path = os.path.join(proj_root_path, "run_general.sh") network_config = os.path.join(proj_root_path, "general_network.json") system_config = os.path.join(proj_root_path, "general_system.txt") workload_config = os.path.join(proj_root_path, "general_workload.txt") + env = AstraSimWrapper.make_astraSim_env(rl_form='random_walker') # env = AstraSimEnv.AstraSimEnv(rl_form='random_walker') @@ -234,43 +317,50 @@ def main(_): env = wrap_in_envlogger(env, traject_dir) # get the dimension of the network + # the dimension is now defined in the template dimension = random.randint(2, 3) start = time.time() step_results = {} + + # INITIATE action dict + action_dict = {} + + # TODO: load network and workloads + action_dict['network'] = {"path": network_file} + action_dict['workload'] = {"path": workload_file} + + # TODO: parse system + parse_system(system_file, action_dict) + + # TODO: parse knobs (all variables to change in action_dict) + system_knob, network_knob = parse_knobs(knobs_spec) for i in range(FLAGS.num_episodes): logging.info('Episode %r', i) + # every step of the current training for step in range(FLAGS.num_steps): - # generate random actions - action = generate_random_actions(dimension) - - # write the three config files - with open("general_network.json", "w") as outfile: - outfile.write(json.dumps(action['network'], indent=4)) - - with open("general_system.txt", 'w') as file: - for key, value in action["system"].items(): - file.write(f'{key}: {value}\n') + # pass into generate_random_actions(dimension, knobs) + action_dict = generate_random_actions(action_dict, system_knob, network_knob) # with open("general_workload.txt", 'w') as file: # file.write(action["workload"]["value"]) # step_result wrapped in TimeStep object - step_result = env.step({}) + step_result = env.step(action_dict) step_type, reward, discount, observation = step_result step_results['reward'] = [reward] - step_results['action'] = action + step_results['action'] = action_dict step_results['obs'] = observation log_results_to_csv(log_path, step_results) end = time.time() - print("Total Time taken: ", end - start) + print("Total Time Taken: ", end - start) print("Total Useful Steps: ", env.useful_counter)