diff options
| author | 2024-09-26 05:11:59 +0000 | |
|---|---|---|
| committer | 2024-09-26 05:11:59 +0000 | |
| commit | 7d89c4758beee0710bb1d0c895e5395578987f86 (patch) | |
| tree | 2412361b17c3e5d50f266f12973dfc4eb53447e6 /tools/edit_monitor | |
| parent | fdc342081e0b704cf6b67a40e0dc4d7a193b5791 (diff) | |
| parent | dc2840dafc7d188ee7a4c0fb717fd3d1b2791a99 (diff) | |
Merge "Support monitoring the subprocess in edit monitor" into main
Diffstat (limited to 'tools/edit_monitor')
| -rw-r--r-- | tools/edit_monitor/daemon_manager.py | 92 | ||||
| -rw-r--r-- | tools/edit_monitor/daemon_manager_test.py | 52 |
2 files changed, 142 insertions, 2 deletions
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py index 8ec25886dc..79831a7eeb 100644 --- a/tools/edit_monitor/daemon_manager.py +++ b/tools/edit_monitor/daemon_manager.py @@ -25,6 +25,9 @@ import time DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1 +DEFAULT_MONITOR_INTERVAL_SECONDS = 5 +DEFAULT_MEMORY_USAGE_THRESHOLD = 2000 +DEFAULT_CPU_USAGE_THRESHOLD = 10 def default_daemon_target(): @@ -48,6 +51,9 @@ class DaemonManager: self.pid = os.getpid() self.daemon_process = None + self.max_memory_usage = 0 + self.max_cpu_usage = 0 + pid_file_dir = pathlib.Path(tempfile.gettempdir()).joinpath("edit_monitor") pid_file_dir.mkdir(parents=True, exist_ok=True) self.pid_file_path = self._get_pid_file_path(pid_file_dir) @@ -61,6 +67,50 @@ class DaemonManager: except Exception as e: logging.exception("Failed to start daemon manager with error %s", e) + def monitor_daemon( + self, + interval: int = DEFAULT_MONITOR_INTERVAL_SECONDS, + memory_threshold: float = DEFAULT_MEMORY_USAGE_THRESHOLD, + cpu_threshold: float = DEFAULT_CPU_USAGE_THRESHOLD, + ): + """Monitors the daemon process status. + + Periodically check the CPU/Memory usage of the daemon process as long as the + process is still running and kill the process if the resource usage is above + given thresholds. + """ + logging.info("start monitoring daemon process %d.", self.daemon_process.pid) + + while self.daemon_process.is_alive(): + try: + memory_usage = self._get_process_memory_percent(self.daemon_process.pid) + self.max_memory_usage = max(self.max_memory_usage, memory_usage) + + cpu_usage = self._get_process_cpu_percent(self.daemon_process.pid) + self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage) + + time.sleep(interval) + except Exception as e: + # Log the error and continue. 
+ logging.warning("Failed to monitor daemon process with error: %s", e) + + if ( + self.max_memory_usage >= memory_threshold + or self.max_cpu_usage >= cpu_threshold + ): + logging.error( + "Daemon process is consuming too much resource, killing..." + ), + self._terminate_process(self.daemon_process.pid) + + logging.info( + "Daemon process %d terminated. Max memory usage: %f, Max cpu" + " usage: %f.", + self.daemon_process.pid, + self.max_memory_usage, + self.max_cpu_usage, + ) + def stop(self): """Stops the daemon process and removes the pidfile.""" @@ -180,3 +230,45 @@ class DaemonManager: logging.info("pid_file_path: %s", pid_file_path) return pid_file_path + + def _get_process_memory_percent(self, pid: int) -> float: + try: + with open(f"/proc/{pid}/stat", "r") as f: + stat_data = f.readline().split() + # RSS is the 24th field in /proc/[pid]/stat + rss_pages = int(stat_data[23]) + return rss_pages * 4 / 1024 # Convert to MB + except (FileNotFoundError, IndexError, ValueError, IOError) as e: + logging.exception("Failed to get memory usage.") + raise e + + def _get_process_cpu_percent(self, pid: int, interval: int = 1) -> float: + try: + total_start_time = self._get_total_cpu_time(pid) + with open("/proc/uptime", "r") as f: + uptime_start = float(f.readline().split()[0]) + + time.sleep(interval) + + total_end_time = self._get_total_cpu_time(pid) + with open("/proc/uptime", "r") as f: + uptime_end = float(f.readline().split()[0]) + + return ( + (total_end_time - total_start_time) + / (uptime_end - uptime_start) + * 100 + ) + except (FileNotFoundError, IndexError, ValueError, IOError) as e: + logging.exception("Failed to get CPU usage.") + raise e + + def _get_total_cpu_time(self, pid: int) -> float: + with open(f"/proc/{str(pid)}/stat", "r") as f: + stats = f.readline().split() + # utime is the 14th field in /proc/[pid]/stat measured in clock ticks. 
+ stime = int(stats[14]) + return (utime + stime) / os.sysconf(os.sysconf_names["SC_CLK_TCK"]) + diff --git a/tools/edit_monitor/daemon_manager_test.py b/tools/edit_monitor/daemon_manager_test.py index 214b0388dc..0c9e04b757 100644 --- a/tools/edit_monitor/daemon_manager_test.py +++ b/tools/edit_monitor/daemon_manager_test.py @@ -43,6 +43,25 @@ def long_running_daemon(): time.sleep(1) +def memory_consume_daemon_target(size_mb): + try: + size_bytes = size_mb * 1024 * 1024 + dummy_data = bytearray(size_bytes) + time.sleep(10) + except MemoryError: + print(f'Process failed to allocate {size_mb} MB of memory.') + + +def cpu_consume_daemon_target(target_usage_percent): + while True: + start_time = time.time() + while time.time() - start_time < target_usage_percent / 100: + pass # Busy loop to consume CPU + + # Sleep to reduce CPU usage + time.sleep(1 - target_usage_percent / 100) + + class DaemonManagerTest(unittest.TestCase): @classmethod @@ -102,7 +121,7 @@ class DaemonManagerTest(unittest.TestCase): def test_start_success_with_existing_instance_from_different_binary(self): # First start an instance based on "some_binary_path" existing_dm = daemon_manager.DaemonManager( - "some_binary_path", + 'some_binary_path', daemon_target=long_running_daemon, ) existing_dm.start() @@ -149,6 +168,35 @@ class DaemonManagerTest(unittest.TestCase): # Verifies no daemon process is started. 
self.assertIsNone(dm.daemon_process) + def test_monitor_daemon_subprocess_killed_high_memory_usage(self): + dm = daemon_manager.DaemonManager( + TEST_BINARY_FILE, + daemon_target=memory_consume_daemon_target, + daemon_args=(2,), + ) + dm.start() + dm.monitor_daemon(interval=1, memory_threshold=2) + + self.assertTrue(dm.max_memory_usage >= 2) + self.assert_no_subprocess_running() + + def test_monitor_daemon_subprocess_killed_high_cpu_usage(self): + dm = daemon_manager.DaemonManager( + TEST_BINARY_FILE, + daemon_target=cpu_consume_daemon_target, + daemon_args=(20,), + ) + dm.start() + dm.monitor_daemon(interval=1, cpu_threshold=20) + + self.assertTrue(dm.max_cpu_usage >= 20) + self.assert_no_subprocess_running() + + @mock.patch('subprocess.check_output') + def test_monitor_daemon_failed_does_not_matter(self, mock_output): + mock_output.side_effect = OSError('Unknown OSError') + self.assert_run_simple_daemon_success() + def test_stop_success(self): dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target=long_running_daemon @@ -194,7 +242,7 @@ class DaemonManagerTest(unittest.TestCase): daemon_args=(damone_output_file.name,), ) dm.start() - dm.daemon_process.join() + dm.monitor_daemon(interval=1) # Verifies the expected pid file is created. expected_pid_file_path = pathlib.Path(self.working_dir.name).joinpath( |