diff --git a/omnistat/omni_util.py b/omnistat/omni_util.py index 8327bcd2..3aa377f6 100755 --- a/omnistat/omni_util.py +++ b/omnistat/omni_util.py @@ -52,12 +52,11 @@ def __init__(self): self.scrape_interval = 30 # default scrape interval in seconds self.timeout = 5 # default scrape timeout in seconds self.__hosts = None + self.__RMS_Detected = False def setup(self, configFileArgument): self.configFile = utils.findConfigFile(configFileArgument) self.runtimeConfig = utils.readConfig(self.configFile) - self.rmsDetection() - self.getRMSHosts() # Path to Omnistat's executable scripts. For source deployments, this # is the top directory of a working copy of Omnistat. For package @@ -73,6 +72,10 @@ def setMonitoringInterval(self, interval): def rmsDetection(self): """Query environment to infer resource manager""" + + if self.__RMS_Detected: + return + if "SLURM_JOB_NODELIST" in os.environ: self.__rms = "slurm" elif "FLUX_URI" in os.environ: @@ -80,6 +83,10 @@ def rmsDetection(self): else: utils.error("Unknown/unsupported resource manager") logging.info("RMS detected = %s" % self.__rms) + + self.getRMSHosts() + self.__RMS_Detected = True + return def getRMSHosts(self): @@ -126,7 +133,13 @@ def startVictoriaServer(self): vm_logfile = self.runtimeConfig[section].get("victoria_logfile", "victoria_server.log") vm_corebinding = self.runtimeConfig[section].getint("victoria_corebinding", None) - command = [vm_binary, "--storageDataPath=%s" % vm_datadir, "-memory.allowedPercent=10"] + command = [ + vm_binary, + "--storageDataPath=%s" % vm_datadir, + "-memory.allowedPercent=10", + "-retentionPeriod=10y", + "-httpListenAddr=:9090", + ] envAddition = {} # restrict thread usage envAddition["GOMAXPROCS"] = "4" @@ -146,6 +159,8 @@ def startPromServer(self, victoriaMode=False): self.startVictoriaServer() return + self.rmsDetection() + logging.info("Starting prometheus server on localhost") if self.scrape_interval >= 1: scrape_interval = "%ss" % int(self.scrape_interval) @@ -187,14 +202,6 @@ def startPromServer(self, victoriaMode=False): "static_configs": [computes], } ) - # if remoteWrite: - # auth = { - # "username": remoteWriteConfig["auth_user"], - # "password": remoteWriteConfig["auth_cred"], - # } - - # prom_config["remote_write"] = [] - # prom_config["remote_write"].append({"url": remoteWriteConfig["url"], "basic_auth": auth}) with open("prometheus.yml", "w") as yaml_file: yaml.dump(prom_config, yaml_file, sort_keys=False) @@ -238,6 +245,8 @@ def startExporters(self, victoriaMode=False): ssh_key = self.runtimeConfig["omnistat.usermode"].get("ssh_key", "~/.ssh/id_rsa") corebinding = self.runtimeConfig["omnistat.usermode"].getint("exporter_corebinding", None) + self.rmsDetection() + if victoriaMode: if os.path.exists("./exporter.log"): os.remove("./exporter.log") @@ -349,7 +358,7 @@ def startExporters(self, victoriaMode=False): return def stopExporters(self, victoriaMode=False): - + self.rmsDetection() port = self.runtimeConfig["omnistat.collectors"].get("port", "8001") for host in self.__hosts: logging.info("Stopping exporter for host -> %s" % host) diff --git a/omnistat/standalone.py b/omnistat/standalone.py index a35dd01f..c725df7b 100755 --- a/omnistat/standalone.py +++ b/omnistat/standalone.py @@ -181,7 +181,6 @@ def polling(self, monitor, interval_secs): logging.info("Previous metric push is still running - blocking till complete.") push_thread.join() logging.info("Resuming after previous metric push complete.") - try: push_start_time = time.perf_counter() dataToPush = self.__dataVM @@ -256,7 +255,7 @@ def parse_args(): parser.add_argument("--interval", type=float, help="sampling frequency (in secs)", default=0.5) parser.add_argument("--logfile", type=str, help="redirect stdout to logfile", default=None) parser.add_argument("--endpoint", type=str, help="hostname of VictoriaMetrics server", default="localhost") - parser.add_argument("--port", type=int, help="port to access VictoriaMetrics server", default=8428) + parser.add_argument("--port", type=int, help="port to access VictoriaMetrics server", default=9090) return parser.parse_args()