Skip to content
This repository was archived by the owner on Oct 16, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion clusterman/draining/queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,11 @@ def terminate_host(host: Host) -> None:
logger.info(f'Terminating: {host.instance_id}')
resource_group_class = RESOURCE_GROUPS[host.sender]
resource_group = resource_group_class(host.group_id)
resource_group.terminate_instances_by_id([host.instance_id])

try:
resource_group.terminate_instances_by_id([host.instance_id])
except Exception:
logger.error(f'Failed to terminate host: {host.hostname}, possibly already terminated.')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: include the actual error text instead of printing an assumption of what went wrong?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: is this just for human user interactions? in case of use by non-humans, where do these logs go? are they monitored?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I can tell these logs aren't processed automatically, and they stay on local host. The script is executed by machines but the logs exist for investigation and debugging if unexpected behaviour occurs. @kawaiwanyelp should have more context on this



def main(args: argparse.Namespace) -> None:
Expand Down
14 changes: 14 additions & 0 deletions tests/draining/queue_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,20 @@ def test_terminate_host():
mock_sfr.return_value.terminate_instances_by_id.assert_called_with(['i123'])


def test_terminate_host_failure_no_crash():
mock_host = mock.Mock(instance_id='i123', sender='sfr', group_id='sfr123')

mock_sfr = mock.Mock()
mock_sfr.terminate_instances_by_id.side_effect = Exception()

with mock.patch.dict(
'clusterman.draining.queue.RESOURCE_GROUPS', {'sfr': mock_sfr}, clear=True
):
terminate_host(mock_host)
mock_sfr.assert_called_with('sfr123')
mock_sfr.return_value.terminate_instances_by_id.assert_called_with(['i123'])


def test_host_from_instance_id():
with mock.patch(
'clusterman.draining.queue.ec2_describe_instances', autospec=True,
Expand Down