diff options
author | 2024-12-04 17:39:54 +0000 | |
---|---|---|
committer | 2024-12-04 21:09:44 +0000 | |
commit | a7107913d9ed38b23d140fe29eef1ff374663902 (patch) | |
tree | 603c7aa29ff5ce6a91989a8a7875d873bb19bf65 | |
parent | 3b2cf40ee8595d3cc88b66a86dede83e56228a8d (diff) |
Fix a race condition in edit monitor
When there are multiple edit monitor starting around the same time,
there will be cases that the pidfile is deleted or overridden as
multiple processes are trying to remove and create the pidfile around
the same time (see details in b/382135550).
This cl fix it by checking the pid in the pidfile and
only remove the pidfile if the pid stored there is the expected one.
Bug: 382135550
Test: atest edit_monitor_integration_test
Change-Id: Ifdb4c29c8b36052fea83ce59de1be9d99db3a852
-rw-r--r-- | tools/edit_monitor/daemon_manager.py | 42 | ||||
-rw-r--r-- | tools/edit_monitor/edit_monitor_integration_test.py | 38 | ||||
-rw-r--r-- | tools/edit_monitor/main.py | 3 |
3 files changed, 65 insertions, 18 deletions
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py index 0c31ab85c9..a352e30291 100644 --- a/tools/edit_monitor/daemon_manager.py +++ b/tools/edit_monitor/daemon_manager.py @@ -149,15 +149,15 @@ class DaemonManager: edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_MEMORY_USAGE ) logging.error( - "Daemon process is consuming too much memory, rebooting...") + "Daemon process is consuming too much memory, rebooting..." + ) self.reboot() if self.max_cpu_usage >= cpu_threshold: self._send_error_event_to_clearcut( edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_CPU_USAGE ) - logging.error( - "Daemon process is consuming too much cpu, killing...") + logging.error("Daemon process is consuming too much cpu, killing...") self._terminate_process(self.daemon_process.pid) logging.info( @@ -179,7 +179,7 @@ class DaemonManager: self._wait_for_process_terminate(self.daemon_process.pid, 1) if self.daemon_process.is_alive(): self._terminate_process(self.daemon_process.pid) - self._remove_pidfile() + self._remove_pidfile(self.pid) logging.info("Successfully stopped daemon manager.") except Exception as e: logging.exception("Failed to stop daemon manager with error %s", e) @@ -253,11 +253,15 @@ class DaemonManager: if ex_pid: logging.info("Found another instance with pid %d.", ex_pid) self._terminate_process(ex_pid) - self._remove_pidfile() + self._remove_pidfile(ex_pid) - def _read_pid_from_pidfile(self): - with open(self.pid_file_path, "r") as f: - return int(f.read().strip()) + def _read_pid_from_pidfile(self) -> int | None: + try: + with open(self.pid_file_path, "r") as f: + return int(f.read().strip()) + except FileNotFoundError as e: + logging.warning("pidfile %s does not exist.", self.pid_file_path) + return None def _write_pid_to_pidfile(self): """Creates a pidfile and writes the current pid to the file. @@ -333,7 +337,23 @@ class DaemonManager: ) return True - def _remove_pidfile(self): + def _remove_pidfile(self, expected_pid: int): + recorded_pid = self._read_pid_from_pidfile() + + if recorded_pid is None: + logging.info("pid file %s already removed.", self.pid_file_path) + return + + if recorded_pid != expected_pid: + logging.warning( + "pid file contains pid from a different process, expected pid: %d," + " actual pid: %d.", + expected_pid, + recorded_pid, + ) + return + + logging.debug("removing pidfile written by process %s", expected_pid) try: os.remove(self.pid_file_path) except FileNotFoundError: @@ -378,9 +398,7 @@ class DaemonManager: uptime_end = float(f.readline().split()[0]) return ( - (total_end_time - total_start_time) - / (uptime_end - uptime_start) - * 100 + (total_end_time - total_start_time) / (uptime_end - uptime_start) * 100 ) def _get_total_cpu_time(self, pid: int) -> float: diff --git a/tools/edit_monitor/edit_monitor_integration_test.py b/tools/edit_monitor/edit_monitor_integration_test.py index 3d28274da4..f39b93667d 100644 --- a/tools/edit_monitor/edit_monitor_integration_test.py +++ b/tools/edit_monitor/edit_monitor_integration_test.py @@ -15,6 +15,7 @@ """Integration tests for Edit Monitor.""" import glob +from importlib import resources import logging import os import pathlib @@ -25,8 +26,6 @@ import sys import tempfile import time import unittest - -from importlib import resources from unittest import mock @@ -49,7 +48,8 @@ class EditMonitorIntegrationTest(unittest.TestCase): self.root_monitoring_path.mkdir() self.edit_monitor_binary_path = self._import_executable("edit_monitor") self.patch = mock.patch.dict( - os.environ, {'ENABLE_ANDROID_EDIT_MONITOR': 'true'}) + os.environ, {"ENABLE_ANDROID_EDIT_MONITOR": "true"} + ) self.patch.start() def tearDown(self): @@ -83,7 +83,21 @@ class EditMonitorIntegrationTest(unittest.TestCase): self.assertEqual(self._get_logged_events_num(), 4) - def _start_edit_monitor_process(self): + def test_start_multiple_edit_monitor_only_one_started(self): + p1 = self._start_edit_monitor_process(wait_for_observer_start=False) + p2 = self._start_edit_monitor_process(wait_for_observer_start=False) + p3 = self._start_edit_monitor_process(wait_for_observer_start=False) + + live_processes = self._get_live_processes([p1, p2, p3]) + + # Cleanup all live processes. + for p in live_processes: + os.kill(p.pid, signal.SIGINT) + p.communicate() + + self.assertEqual(len(live_processes), 1) + + def _start_edit_monitor_process(self, wait_for_observer_start=True): command = f""" export TMPDIR="{self.working_dir.name}" {self.edit_monitor_binary_path} --path={self.root_monitoring_path} --dry_run""" @@ -94,7 +108,9 @@ class EditMonitorIntegrationTest(unittest.TestCase): start_new_session=True, executable="/bin/bash", ) - self._wait_for_observer_start(time_out=5) + if wait_for_observer_start: + self._wait_for_observer_start(time_out=5) + return p def _wait_for_observer_start(self, time_out): @@ -125,6 +141,18 @@ class EditMonitorIntegrationTest(unittest.TestCase): return 0 + def _get_live_processes(self, processes): + live_processes = [] + for p in processes: + try: + p.wait(timeout=5) + except subprocess.TimeoutExpired as e: + live_processes.append(p) + logging.info("process: %d still alive.", p.pid) + else: + logging.info("process: %d stopped.", p.pid) + return live_processes + def _import_executable(self, executable_name: str) -> pathlib.Path: binary_dir = pathlib.Path(self.working_dir.name).joinpath("binary") binary_dir.mkdir() diff --git a/tools/edit_monitor/main.py b/tools/edit_monitor/main.py index 49385f1cc5..7ca0daa7c5 100644 --- a/tools/edit_monitor/main.py +++ b/tools/edit_monitor/main.py @@ -72,7 +72,8 @@ def configure_logging(verbose=False): root_logging_dir = tempfile.mkdtemp(prefix='edit_monitor_') _, log_path = tempfile.mkstemp(dir=root_logging_dir, suffix='.log') - log_fmt = '%(asctime)s %(filename)s:%(lineno)s:%(levelname)s: %(message)s' + + log_fmt = '%(asctime)s.%(msecs)03d %(filename)s:%(lineno)s:%(levelname)s: %(message)s' date_fmt = '%Y-%m-%d %H:%M:%S' log_level = logging.DEBUG if verbose else logging.INFO |