diff options
author | 2024-10-14 20:32:53 +0000 | |
---|---|---|
committer | 2024-10-24 16:36:09 +0000 | |
commit | ba64f31b5698b3c7440f1b6f4df1bfb202a208b0 (patch) | |
tree | 60cb597b42e8b11bceed2487e783542838f17903 | |
parent | f64e2f84330dd61250168206ff3dfdeb755efa49 (diff) |
Report edit monitor error to clearcut
Report unexpected errors to clearcut when the daemon manager detect
anything wrong when start/monitor/stop/reboot the edit monitor process.
Test: atest daemon_manager_test
Bug: 365617369
Change-Id: I2027229c387ce2525d1f730d06483032662ffc4b
-rw-r--r-- | tools/edit_monitor/daemon_manager.py | 58 | ||||
-rw-r--r-- | tools/edit_monitor/daemon_manager_test.py | 120 |
2 files changed, 152 insertions, 26 deletions
diff --git a/tools/edit_monitor/daemon_manager.py b/tools/edit_monitor/daemon_manager.py index 4ff4ec87cf..29746e47c4 100644 --- a/tools/edit_monitor/daemon_manager.py +++ b/tools/edit_monitor/daemon_manager.py @@ -13,17 +13,22 @@ # limitations under the License. +import getpass import hashlib import logging import multiprocessing import os import pathlib +import platform import signal import subprocess import sys import tempfile import time +from atest.metrics import clearcut_client +from atest.proto import clientanalytics_pb2 +from proto import edit_event_pb2 DEFAULT_PROCESS_TERMINATION_TIMEOUT_SECONDS = 1 DEFAULT_MONITOR_INTERVAL_SECONDS = 5 @@ -31,6 +36,9 @@ DEFAULT_MEMORY_USAGE_THRESHOLD = 2000 DEFAULT_CPU_USAGE_THRESHOLD = 200 DEFAULT_REBOOT_TIMEOUT_SECONDS = 60 * 60 * 24 BLOCK_SIGN_FILE = "edit_monitor_block_sign" +# Enum of the Clearcut log source defined under +# /google3/wireless/android/play/playlog/proto/log_source_enum.proto +LOG_SOURCE = 2524 def default_daemon_target(): @@ -46,11 +54,16 @@ class DaemonManager: binary_path: str, daemon_target: callable = default_daemon_target, daemon_args: tuple = (), + cclient: clearcut_client.Clearcut | None = None, ): self.binary_path = binary_path self.daemon_target = daemon_target self.daemon_args = daemon_args + self.cclient = cclient or clearcut_client.Clearcut(LOG_SOURCE) + self.user_name = getpass.getuser() + self.host_name = platform.node() + self.source_root = os.environ.get("ANDROID_BUILD_TOP", "") self.pid = os.getpid() self.daemon_process = None @@ -70,13 +83,20 @@ class DaemonManager: logging.warning("Block sign found, exiting...") return - if self.binary_path.startswith('/google/cog/'): + if self.binary_path.startswith("/google/cog/"): logging.warning("Edit monitor for cog is not supported, exiting...") return - self._stop_any_existing_instance() - self._write_pid_to_pidfile() - self._start_daemon_process() + try: + self._stop_any_existing_instance() + self._write_pid_to_pidfile() + self._start_daemon_process() + except Exception as e: + logging.exception("Failed to start daemon manager with error %s", e) + self._send_error_event_to_clearcut( + edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR + ) + raise e def monitor_daemon( self, @@ -118,6 +138,9 @@ class DaemonManager: logging.error( "Daemon process is consuming too much resource, killing..." ), + self._send_error_event_to_clearcut( + edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE + ) self._terminate_process(self.daemon_process.pid) logging.info( @@ -143,6 +166,12 @@ class DaemonManager: logging.debug("Successfully stopped daemon manager.") except Exception as e: logging.exception("Failed to stop daemon manager with error %s", e) + self._send_error_event_to_clearcut( + edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR + ) + sys.exit(1) + finally: + self.cclient.flush_events() def reboot(self): """Reboots the current process. @@ -164,6 +193,9 @@ class DaemonManager: os.execv(self.binary_path, sys.argv) except OSError as e: logging.exception("Failed to reboot process with error: %s.", e) + self._send_error_event_to_clearcut( + edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR + ) sys.exit(1) # Indicate an error occurred def cleanup(self): @@ -175,6 +207,7 @@ class DaemonManager: that requires immediate cleanup to prevent damanger to the system. """ logging.debug("Start cleaning up all existing instances.") + self._send_error_event_to_clearcut(edit_event_pb2.EditEvent.FORCE_CLEANUP) try: # First places a block sign to prevent any edit monitor process to start. @@ -355,4 +388,19 @@ class DaemonManager: except (FileNotFoundError, IOError, ValueError, TypeError): logging.exception("Failed to get pid from file path: %s", file) - return pids
\ No newline at end of file + return pids + + def _send_error_event_to_clearcut(self, error_type): + edit_monitor_error_event_proto = edit_event_pb2.EditEvent( + user_name=self.user_name, + host_name=self.host_name, + source_root=self.source_root, + ) + edit_monitor_error_event_proto.edit_monitor_error_event.CopyFrom( + edit_event_pb2.EditEvent.EditMonitorErrorEvent(error_type=error_type) + ) + log_event = clientanalytics_pb2.LogEvent( + event_time_ms=int(time.time() * 1000), + source_extension=edit_monitor_error_event_proto.SerializeToString(), + ) + self.cclient.log(log_event) diff --git a/tools/edit_monitor/daemon_manager_test.py b/tools/edit_monitor/daemon_manager_test.py index 72442c61d4..e132000bce 100644 --- a/tools/edit_monitor/daemon_manager_test.py +++ b/tools/edit_monitor/daemon_manager_test.py @@ -26,6 +26,7 @@ import time import unittest from unittest import mock from edit_monitor import daemon_manager +from proto import edit_event_pb2 TEST_BINARY_FILE = '/path/to/test_binary' @@ -133,7 +134,8 @@ class DaemonManagerTest(unittest.TestCase): def test_start_return_directly_if_in_cog_env(self): dm = daemon_manager.DaemonManager( - '/google/cog/cloud/user/workspace/edit_monitor') + '/google/cog/cloud/user/workspace/edit_monitor' + ) dm.start() # Verify no daemon process is started. self.assertIsNone(dm.daemon_process) @@ -148,9 +150,13 @@ class DaemonManagerTest(unittest.TestCase): with open(pid_file_path_dir.joinpath(TEST_PID_FILE_PATH), 'w') as f: f.write('123456') - with self.assertRaises(OSError) as error: - dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) + fake_cclient = FakeClearcutClient() + with self.assertRaises(OSError): + dm = daemon_manager.DaemonManager(TEST_BINARY_FILE, cclient=fake_cclient) dm.start() + self._assert_error_event_logged( + fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR + ) def test_start_failed_to_write_pidfile(self): pid_file_path_dir = pathlib.Path(self.working_dir.name).joinpath( @@ -160,40 +166,63 @@ class DaemonManagerTest(unittest.TestCase): # Makes the directory read-only so write pidfile will fail. os.chmod(pid_file_path_dir, 0o555) - with self.assertRaises(PermissionError) as error: - dm = daemon_manager.DaemonManager(TEST_BINARY_FILE) + fake_cclient = FakeClearcutClient() + with self.assertRaises(PermissionError): + dm = daemon_manager.DaemonManager(TEST_BINARY_FILE, cclient=fake_cclient) dm.start() + self._assert_error_event_logged( + fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR + ) def test_start_failed_to_start_daemon_process(self): - with self.assertRaises(TypeError) as error: + fake_cclient = FakeClearcutClient() + with self.assertRaises(TypeError): dm = daemon_manager.DaemonManager( - TEST_BINARY_FILE, daemon_target='wrong_target', daemon_args=(1) + TEST_BINARY_FILE, + daemon_target='wrong_target', + daemon_args=(1), + cclient=fake_cclient, ) dm.start() + self._assert_error_event_logged( + fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_START_EDIT_MONITOR + ) def test_monitor_daemon_subprocess_killed_high_memory_usage(self): + fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target=memory_consume_daemon_target, daemon_args=(2,), + cclient=fake_cclient, ) dm.start() dm.monitor_daemon(interval=1, memory_threshold=2) self.assertTrue(dm.max_memory_usage >= 2) self.assert_no_subprocess_running() + self._assert_error_event_logged( + fake_cclient, + edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE, + ) def test_monitor_daemon_subprocess_killed_high_cpu_usage(self): + fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( TEST_BINARY_FILE, daemon_target=cpu_consume_daemon_target, daemon_args=(20,), + cclient=fake_cclient, ) dm.start() dm.monitor_daemon(interval=1, cpu_threshold=20) self.assertTrue(dm.max_cpu_usage >= 20) self.assert_no_subprocess_running() + self._assert_error_event_logged( + fake_cclient, + edit_event_pb2.EditEvent.KILLED_DUE_TO_EXCEEDED_RESOURCE_USAGE, + ) @mock.patch('subprocess.check_output') def test_monitor_daemon_failed_does_not_matter(self, mock_output): @@ -207,7 +236,8 @@ class DaemonManagerTest(unittest.TestCase): ) dm = daemon_manager.DaemonManager( - binary_file.name, daemon_target=long_running_daemon + binary_file.name, + daemon_target=long_running_daemon, ) dm.start() dm.monitor_daemon(reboot_timeout=0.5) @@ -226,27 +256,42 @@ class DaemonManagerTest(unittest.TestCase): @mock.patch('os.kill') def test_stop_failed_to_kill_daemon_process(self, mock_kill): mock_kill.side_effect = OSError('Unknown OSError') + fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( - TEST_BINARY_FILE, daemon_target=long_running_daemon + TEST_BINARY_FILE, + daemon_target=long_running_daemon, + cclient=fake_cclient, ) - dm.start() - dm.stop() - self.assertTrue(dm.daemon_process.is_alive()) - self.assertTrue(dm.pid_file_path.exists()) + with self.assertRaises(SystemExit): + dm.start() + dm.stop() + self.assertTrue(dm.daemon_process.is_alive()) + self.assertTrue(dm.pid_file_path.exists()) + self._assert_error_event_logged( + fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR + ) @mock.patch('os.remove') def test_stop_failed_to_remove_pidfile(self, mock_remove): mock_remove.side_effect = OSError('Unknown OSError') + fake_cclient = FakeClearcutClient() dm = daemon_manager.DaemonManager( - TEST_BINARY_FILE, daemon_target=long_running_daemon + TEST_BINARY_FILE, + daemon_target=long_running_daemon, + cclient=fake_cclient, ) - dm.start() - dm.stop() - self.assert_no_subprocess_running() - self.assertTrue(dm.pid_file_path.exists()) + with self.assertRaises(SystemExit): + dm.start() + dm.stop() + self.assert_no_subprocess_running() + self.assertTrue(dm.pid_file_path.exists()) + + self._assert_error_event_logged( + fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_STOP_EDIT_MONITOR + ) @mock.patch('os.execv') def test_reboot_success(self, mock_execv): @@ -273,7 +318,7 @@ class DaemonManagerTest(unittest.TestCase): ) dm.start() - with self.assertRaises(SystemExit) as cm: + with self.assertRaises(SystemExit): dm.reboot() mock_execv.assert_not_called() self.assertEqual(cm.exception.code, 0) @@ -281,18 +326,24 @@ class DaemonManagerTest(unittest.TestCase): @mock.patch('os.execv') def test_reboot_failed(self, mock_execv): mock_execv.side_effect = OSError('Unknown OSError') + fake_cclient = FakeClearcutClient() binary_file = tempfile.NamedTemporaryFile( dir=self.working_dir.name, delete=False ) dm = daemon_manager.DaemonManager( - binary_file.name, daemon_target=long_running_daemon + binary_file.name, + daemon_target=long_running_daemon, + cclient=fake_cclient, ) dm.start() - with self.assertRaises(SystemExit) as cm: + with self.assertRaises(SystemExit): dm.reboot() self.assertEqual(cm.exception.code, 1) + self._assert_error_event_logged( + fake_cclient, edit_event_pb2.EditEvent.FAILED_TO_REBOOT_EDIT_MONITOR + ) def assert_run_simple_daemon_success(self): damone_output_file = tempfile.NamedTemporaryFile( @@ -374,6 +425,33 @@ class DaemonManagerTest(unittest.TestCase): f.write(str(p.pid)) return p + def _assert_error_event_logged(self, fake_cclient, error_type): + error_events = fake_cclient.get_sent_events() + self.assertEquals(len(error_events), 1) + self.assertEquals( + edit_event_pb2.EditEvent.FromString( + error_events[0].source_extension + ).edit_monitor_error_event.error_type, + error_type, + ) + + +class FakeClearcutClient: + + def __init__(self): + self.pending_log_events = [] + self.sent_log_event = [] + + def log(self, log_event): + self.pending_log_events.append(log_event) + + def flush_events(self): + self.sent_log_event.extend(self.pending_log_events) + self.pending_log_events.clear() + + def get_sent_events(self): + return self.sent_log_event + self.pending_log_events + if __name__ == '__main__': unittest.main() |