summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Zhuoyao Zhang <zhuoyao@google.com> 2024-12-04 17:39:54 +0000
committer Zhuoyao Zhang <zhuoyao@google.com> 2024-12-04 21:09:44 +0000
commita7107913d9ed38b23d140fe29eef1ff374663902 (patch)
tree603c7aa29ff5ce6a91989a8a7875d873bb19bf65
parent3b2cf40ee8595d3cc88b66a86dede83e56228a8d (diff)
Fix a race condition in edit monitor
When there are multiple edit monitor starting around the same time, there will be cases that the pidfile is deleted or overridden as multiple processes are trying to remove and create the pidfile around the same time (see details in b/382135550). This cl fix it by checking the pid in the pidfile and only remove the pidfile if the pid stored there is the expected one. Bug: 382135550 Test: atest edit_monitor_integration_test Change-Id: Ifdb4c29c8b36052fea83ce59de1be9d99db3a852
-rw-r--r--tools/edit_monitor/daemon_manager.py42
-rw-r--r--tools/edit_monitor/edit_monitor_integration_test.py38
-rw-r--r--tools/edit_monitor/main.py3
3 files changed, 65 insertions, 18 deletions
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py
index 0c31ab85c9..a352e30291 100644
--- a/tools/edit_monitor/daemon_manager.py
+++ b/tools/edit_monitor/daemon_manager.py
@@ -149,15 +149,15 @@ class DaemonManager:
edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_MEMORY_USAGE
)
logging.error(
- "Daemon process is consuming too much memory, rebooting...")
+ "Daemon process is consuming too much memory, rebooting..."
+ )
self.reboot()
if self.max_cpu_usage >= cpu_threshold:
self._send_error_event_to_clearcut(
edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_CPU_USAGE
)
- logging.error(
- "Daemon process is consuming too much cpu, killing...")
+ logging.error("Daemon process is consuming too much cpu, killing...")
self._terminate_process(self.daemon_process.pid)
logging.info(
@@ -179,7 +179,7 @@ class DaemonManager:
self._wait_for_process_terminate(self.daemon_process.pid, 1)
if self.daemon_process.is_alive():
self._terminate_process(self.daemon_process.pid)
- self._remove_pidfile()
+ self._remove_pidfile(self.pid)
logging.info("Successfully stopped daemon manager.")
except Exception as e:
logging.exception("Failed to stop daemon manager with error %s", e)
@@ -253,11 +253,15 @@ class DaemonManager:
if ex_pid:
logging.info("Found another instance with pid %d.", ex_pid)
self._terminate_process(ex_pid)
- self._remove_pidfile()
+ self._remove_pidfile(ex_pid)
- def _read_pid_from_pidfile(self):
- with open(self.pid_file_path, "r") as f:
- return int(f.read().strip())
+ def _read_pid_from_pidfile(self) -> int | None:
+ try:
+ with open(self.pid_file_path, "r") as f:
+ return int(f.read().strip())
+ except FileNotFoundError as e:
+ logging.warning("pidfile %s does not exist.", self.pid_file_path)
+ return None
def _write_pid_to_pidfile(self):
"""Creates a pidfile and writes the current pid to the file.
@@ -333,7 +337,23 @@ class DaemonManager:
)
return True
- def _remove_pidfile(self):
+ def _remove_pidfile(self, expected_pid: int):
+ recorded_pid = self._read_pid_from_pidfile()
+
+ if recorded_pid is None:
+ logging.info("pid file %s already removed.", self.pid_file_path)
+ return
+
+ if recorded_pid != expected_pid:
+ logging.warning(
+ "pid file contains pid from a different process, expected pid: %d,"
+ " actual pid: %d.",
+ expected_pid,
+ recorded_pid,
+ )
+ return
+
+ logging.debug("removing pidfile written by process %s", expected_pid)
try:
os.remove(self.pid_file_path)
except FileNotFoundError:
@@ -378,9 +398,7 @@ class DaemonManager:
uptime_end = float(f.readline().split()[0])
return (
- (total_end_time - total_start_time)
- / (uptime_end - uptime_start)
- * 100
+ (total_end_time - total_start_time) / (uptime_end - uptime_start) * 100
)
def _get_total_cpu_time(self, pid: int) -> float:
diff --git a/tools/edit_monitor/edit_monitor_integration_test.py b/tools/edit_monitor/edit_monitor_integration_test.py
index 3d28274da4..f39b93667d 100644
--- a/tools/edit_monitor/edit_monitor_integration_test.py
+++ b/tools/edit_monitor/edit_monitor_integration_test.py
@@ -15,6 +15,7 @@
"""Integration tests for Edit Monitor."""
import glob
+from importlib import resources
import logging
import os
import pathlib
@@ -25,8 +26,6 @@ import sys
import tempfile
import time
import unittest
-
-from importlib import resources
from unittest import mock
@@ -49,7 +48,8 @@ class EditMonitorIntegrationTest(unittest.TestCase):
self.root_monitoring_path.mkdir()
self.edit_monitor_binary_path = self._import_executable("edit_monitor")
self.patch = mock.patch.dict(
- os.environ, {'ENABLE_ANDROID_EDIT_MONITOR': 'true'})
+ os.environ, {"ENABLE_ANDROID_EDIT_MONITOR": "true"}
+ )
self.patch.start()
def tearDown(self):
@@ -83,7 +83,21 @@ class EditMonitorIntegrationTest(unittest.TestCase):
self.assertEqual(self._get_logged_events_num(), 4)
- def _start_edit_monitor_process(self):
+ def test_start_multiple_edit_monitor_only_one_started(self):
+ p1 = self._start_edit_monitor_process(wait_for_observer_start=False)
+ p2 = self._start_edit_monitor_process(wait_for_observer_start=False)
+ p3 = self._start_edit_monitor_process(wait_for_observer_start=False)
+
+ live_processes = self._get_live_processes([p1, p2, p3])
+
+ # Cleanup all live processes.
+ for p in live_processes:
+ os.kill(p.pid, signal.SIGINT)
+ p.communicate()
+
+ self.assertEqual(len(live_processes), 1)
+
+ def _start_edit_monitor_process(self, wait_for_observer_start=True):
command = f"""
export TMPDIR="{self.working_dir.name}"
{self.edit_monitor_binary_path} --path={self.root_monitoring_path} --dry_run"""
@@ -94,7 +108,9 @@ class EditMonitorIntegrationTest(unittest.TestCase):
start_new_session=True,
executable="/bin/bash",
)
- self._wait_for_observer_start(time_out=5)
+ if wait_for_observer_start:
+ self._wait_for_observer_start(time_out=5)
+
return p
def _wait_for_observer_start(self, time_out):
@@ -125,6 +141,18 @@ class EditMonitorIntegrationTest(unittest.TestCase):
return 0
+ def _get_live_processes(self, processes):
+ live_processes = []
+ for p in processes:
+ try:
+ p.wait(timeout=5)
+ except subprocess.TimeoutExpired as e:
+ live_processes.append(p)
+ logging.info("process: %d still alive.", p.pid)
+ else:
+ logging.info("process: %d stopped.", p.pid)
+ return live_processes
+
def _import_executable(self, executable_name: str) -> pathlib.Path:
binary_dir = pathlib.Path(self.working_dir.name).joinpath("binary")
binary_dir.mkdir()
diff --git a/tools/edit_monitor/main.py b/tools/edit_monitor/main.py
index 49385f1cc5..7ca0daa7c5 100644
--- a/tools/edit_monitor/main.py
+++ b/tools/edit_monitor/main.py
@@ -72,7 +72,8 @@ def configure_logging(verbose=False):
root_logging_dir = tempfile.mkdtemp(prefix='edit_monitor_')
_, log_path = tempfile.mkstemp(dir=root_logging_dir, suffix='.log')
- log_fmt = '%(asctime)s %(filename)s:%(lineno)s:%(levelname)s: %(message)s'
+
+ log_fmt = '%(asctime)s.%(msecs)03d %(filename)s:%(lineno)s:%(levelname)s: %(message)s'
date_fmt = '%Y-%m-%d %H:%M:%S'
log_level = logging.DEBUG if verbose else logging.INFO