今天遇到一个诡异的问题,对某个有问题的计算节点进行疏散,结果有些虚拟机的根磁盘居然消!失!了?首先能够确定的是ceph不会自动删除,那么一定是某个地方触发了删除根磁盘的操作。

这如果发生在生产环境可是一个极其严重的问题,正好借此排查的机会梳理一下nova关于主机疏散的流程。

以下代码为N版,但大体流程相差应该不大。

根据套路(不知道套路的可以看以前的系列文章),定位到疏散入口为 api/openstack/compute/evacuate.py 中的 _evacuate 函数,这里比较重要的一步是判断是否使用了共享存储,然后调用 compute/api.py 中的 evacuate 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def evacuate(self, context, instance, host, on_shared_storage,
admin_password=None, force=None):
......
migration = objects.Migration(context,
source_compute=instance.host,
source_node=instance.node,
instance_uuid=instance.uuid,
status='accepted',
migration_type='evacuation')
if host:
migration.dest_compute = host
migration.create()
......
return self.compute_task_api.rebuild_instance(context,
instance=instance,
new_pass=admin_password,
injected_files=None,
image_ref=None,
orig_image_ref=None,
orig_sys_metadata=None,
bdms=None,
recreate=True,
on_shared_storage=on_shared_storage,
host=host,
request_spec=request_spec,
)

这个函数首先创建一条 status 为 accepted、migration_type 为 evacuation 的 Migration 记录,然后判断是否指定了目标主机(在我们的使用场景下 host 始终为 None)。这里需要注意 recreate 参数,在疏散调用时它的值为 True。

然后跟进 conductor/manager.py 中 ComputeTaskManager 类的 rebuild_instance 方法,由于没有指定目标主机,这里会通过调度器进行主机筛选:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
injected_files, new_pass, orig_sys_metadata,
bdms, recreate, on_shared_storage,
preserve_ephemeral=False, host=None,
request_spec=None, force_dest=None, filter_properties=None):
dest_filter_properties = filter_properties
with compute_utils.EventReporter(context, 'rebuild_server',
instance.uuid):
node = limits = None
if not host:
......
try:
hosts = self._schedule_instances(context, request_spec, filter_properties)
host_dict = hosts.pop(0)
host, node, limits = (host_dict['host'],
host_dict['nodename'],
host_dict['limits'])
except exception.NoValidHost as ex:
with excutils.save_and_reraise_exception():
self._set_vm_state_and_notify(context, instance.uuid,
'rebuild_server',
{'vm_state': instance.vm_state,
'task_state': None}, ex, request_spec)
LOG.warning(_LW("No valid host found for rebuild"),
instance=instance)
......
try:
migration = objects.Migration.get_by_instance_and_status(
context, instance.uuid, 'accepted')
except exception.MigrationNotFoundByStatus:
LOG.debug("No migration record for the rebuild/evacuate "
"request.", instance=instance)
migration = None
self.compute_rpcapi.rebuild_instance(context,
instance=instance,
new_pass=new_pass,
injected_files=injected_files,
image_ref=image_ref,
orig_image_ref=orig_image_ref,
orig_sys_metadata=orig_sys_metadata,
bdms=bdms,
recreate=recreate,
on_shared_storage=on_shared_storage,
preserve_ephemeral=preserve_ephemeral,
migration=migration,
host=host, node=node, limits=limits)

然后再经过 compute/rpcapi.py 中 ComputeAPI 类的 rebuild_instance 方法,最终进入 compute/manager.py 中的 _do_rebuild_instance 函数,这个函数中进行主机状态、网络、磁盘相关的修改,构造好参数后进入 _rebuild_default_impl 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def _rebuild_default_impl(self, context, instance, image_meta,
                          injected_files, admin_password, bdms,
                          detach_block_devices, attach_block_devices,
                          network_info=None,
                          recreate=False, block_device_info=None,
                          preserve_ephemeral=False):
    """Rebuild an instance by detaching its volumes and spawning it again.

    :param recreate: when true (the evacuate path passes True), the
        instance is not powered off and ``driver.destroy`` is not called;
        only the block devices are detached before re-spawning.
    :param detach_block_devices: callable invoked as
        ``detach_block_devices(context, bdms)``.
    :param attach_block_devices: callable invoked as
        ``attach_block_devices(context, instance, bdms)``; its return value
        is passed to ``driver.spawn`` as ``block_device_info``.
    :param preserve_ephemeral: not supported by this code path.
    :raises PreserveEphemeralNotSupported: if ``preserve_ephemeral`` is
        true.
    """
    if preserve_ephemeral:
        # The default code path does not support preserving ephemeral
        # partitions.
        raise exception.PreserveEphemeralNotSupported()

    if recreate:
        # Evacuate: nothing is powered off or destroyed through the driver
        # here, only the volumes are detached.
        detach_block_devices(context, bdms)
    else:
        # Plain rebuild: stop the guest, detach its volumes, then let the
        # driver destroy it. destroy() is called without destroy_disks.
        self._power_off_instance(context, instance, clean_shutdown=True)
        detach_block_devices(context, bdms)
        self.driver.destroy(context, instance,
                            network_info=network_info,
                            block_device_info=block_device_info)

    # Each save() names the task state it expects to be replacing.
    instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
    instance.save(expected_task_state=[task_states.REBUILDING])

    new_block_device_info = attach_block_devices(context, instance, bdms)

    instance.task_state = task_states.REBUILD_SPAWNING
    instance.save(
        expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING])

    # Spawn with the freshly attached block devices, not the
    # block_device_info that was passed in.
    with instance.mutated_migration_context():
        self.driver.spawn(context, instance, image_meta, injected_files,
                          admin_password, network_info=network_info,
                          block_device_info=new_block_device_info)

由于传入的 recreate 值为 True,走的是 if 分支,怎么看也不会触发删除磁盘的行为啊?不过 else 分支中的 driver.destroy 函数看起来很可疑,跟进去看看,代码位于 virt/libvirt/driver.py 中:

1
2
3
4
5
6

def destroy(self, context, instance, network_info, block_device_info=None,
            destroy_disks=True, migrate_data=None):
    """Tear down *instance*, then clean up what it leaves behind.

    Calls ``self._destroy(instance)`` first and afterwards forwards the
    remaining arguments to ``self.cleanup``. ``destroy_disks`` defaults to
    True and is passed through unchanged.
    """
    self._destroy(instance)
    # Optional arguments are forwarded by name so the mapping onto
    # cleanup()'s parameters is explicit.
    self.cleanup(context, instance, network_info,
                 block_device_info=block_device_info,
                 destroy_disks=destroy_disks,
                 migrate_data=migrate_data)

其中 _destroy 函数主要是销毁虚拟机实例,代码有兴趣的可以看看。而真正执行删磁盘的代码位于 cleanup 函数中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def cleanup(self, context, instance, network_info, block_device_info=None,
destroy_disks=True, migrate_data=None, destroy_vifs=True):
......
try:
self._disconnect_volume(connection_info, disk_dev)
except Exception as exc:
with excutils.save_and_reraise_exception() as ctxt:
if destroy_disks:
# Don't block on Volume errors if we're trying to
# delete the instance as we may be partially created
# or deleted
ctxt.reraise = False
LOG.warning(
_LW("Ignoring Volume Error on vol %(vol_id)s "
"during delete %(exc)s"),
{'vol_id': vol.get('volume_id'), 'exc': exc},
instance=instance)
if destroy_disks:
# NOTE(haomai): destroy volumes if needed
if CONF.libvirt.images_type == 'lvm':
self._cleanup_lvm(instance, block_device_info)
if CONF.libvirt.images_type == 'rbd':
self._cleanup_rbd(instance)
......
def _cleanup_rbd(self, instance):
    """Remove the RBD volumes whose names start with the instance UUID.

    While the instance's task state is RESIZE_REVERTING only volumes that
    also end with ``disk.local`` are selected.
    """
    # NOTE(nic): On revert_resize, the cleanup steps for the root
    # volume are handled with an "rbd snap rollback" command,
    # and none of this is needed (and is, in fact, harmful) so
    # filter out non-ephemerals from the list
    reverting = instance.task_state == task_states.RESIZE_REVERTING

    def _selected(disk):
        if not disk.startswith(instance.uuid):
            return False
        return disk.endswith('disk.local') if reverting else True

    LibvirtDriver._get_rbd_driver().cleanup_volumes(_selected)

# Roy注:下面这函数在virt/libvirt/storage/rbd_utils.py中
def cleanup_volumes(self, filter_fn):
    """Destroy every volume in the pool for which ``filter_fn`` is true.

    :param filter_fn: predicate called with each name returned by
        ``RbdProxy().list``; matching volumes are passed to
        ``self._destroy_volume``.
    """
    with RADOSClient(self, self.pool) as client:
        names = RbdProxy().list(client.ioctx)
        doomed = (name for name in names if filter_fn(name))
        for name in doomed:
            self._destroy_volume(client, name)

至此,能够触发删除根磁盘的代码可以确定了。这里我们倒过来想,如果疏散行为本身不会导致磁盘被删除,那么会不会是其他地方调用了这个函数呢?经过排查,所有会调用这个 destroy 函数的地方有4个:

  1. evacuate
  2. revert_resize
  3. shelve_offloading
  4. init_host

其中1、2、3都被排除,那么就看 init_host 时做了什么吧:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def init_host(self):
"""Initialization for a standalone compute service."""
......
try:
# checking that instance was not already evacuated to other host
self._destroy_evacuated_instances(context)
for instance in instances:
self._init_instance(context, instance)
finally:
if CONF.defer_iptables_apply:
self.driver.filter_defer_apply_off()
self._update_scheduler_instance_info(context, instances)


def _destroy_evacuated_instances(self, context):
    """Destroys evacuated instances.

    While nova-compute was down, the instances running on it could be
    evacuated to another host. Look up evacuation migrations whose source
    is this host and destroy any instance still reported by the driver
    that has such a migration record.

    NOTE(review): the original docstring also said instances in the
    MIGRATING, RESIZE_MIGRATING, RESIZE_MIGRATED, RESIZE_FINISH task state
    or RESIZED vm state are excepted; no such check is visible in this
    function -- confirm whether it is applied elsewhere.
    """
    # Evacuation records that originate from this host and are either
    # still 'accepted' or already 'done'.
    filters = {
        'source_compute': self.host,
        'status': ['accepted', 'done'],
        'migration_type': 'evacuation',
    }
    evacuations = objects.MigrationList.get_by_filters(context, filters)
    if not evacuations:
        return
    # Re-key by instance UUID for the membership test and lookup below.
    evacuations = {mig.instance_uuid: mig for mig in evacuations}

    filters = {'deleted': False}
    local_instances = self._get_instances_on_driver(context, filters)
    evacuated = [inst for inst in local_instances
                 if inst.uuid in evacuations]
    for instance in evacuated:
        migration = evacuations[instance.uuid]
        LOG.info(_LI('Deleting instance as it has been evacuated from '
                     'this host'), instance=instance)
        try:
            network_info = self.network_api.get_instance_nw_info(
                context, instance)
            bdi = self._get_instance_block_device_info(context,
                                                       instance)
            # Disks are kept when the instance storage is shared.
            destroy_disks = not (self._is_instance_storage_shared(
                context, instance))
        except exception.InstanceNotFound:
            # Fall back to empty network/block-device info.
            network_info = network_model.NetworkInfo()
            bdi = {}
            LOG.info(_LI('Instance has been marked deleted already, '
                         'removing it from the hypervisor.'),
                     instance=instance)
            # always destroy disks if the instance was deleted
            destroy_disks = True
        self.driver.destroy(context, instance,
                            network_info,
                            bdi, destroy_disks)
        # Mark the record 'completed'; that status is not in the filter
        # above.
        migration.status = 'completed'
        migration.save()

会不会是疏散过程中触发了异常然后又重启了计算节点才导致的根磁盘被删除呢?

后来和同事商讨后发现我的思路错了,被上面 driver.destroy 这个名称误导了,其实还有一个地方会进行删除磁盘的行为,就是 spawn 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def spawn(self, context, instance, image_meta, injected_files,
admin_password, network_info=None, block_device_info=None):
disk_info = blockinfo.get_disk_info(CONF.libvirt.virt_type,
instance,
image_meta,
block_device_info)
...
xml = self._get_guest_xml(context, instance, network_info,
disk_info, image_meta,
block_device_info=block_device_info)
self._create_domain_and_network(
context, xml, instance, network_info, disk_info,
block_device_info=block_device_info,
post_xml_callback=gen_confdrive,
destroy_disks_on_failure=True)
...

def _create_domain_and_network(self, context, xml, instance, network_info,
disk_info, block_device_info=None,
power_on=True, reboot=False,
vifs_already_plugged=False,
post_xml_callback=None,
destroy_disks_on_failure=False):

block_device_mapping = driver.block_device_info_get_mapping(
block_device_info)
...
except Exception:
# Any other error, be sure to clean up
LOG.error(_LE('Failed to start libvirt guest'),
instance=instance)
with excutils.save_and_reraise_exception():
self._cleanup_failed_start(context, instance, network_info,
block_device_info, guest,
destroy_disks_on_failure)

def _cleanup_failed_start(self, context, instance, network_info,
block_device_info, guest, destroy_disks):
try:
if guest and guest.is_active():
guest.poweroff()
finally:
self.cleanup(context, instance, network_info=network_info,
block_device_info=block_device_info,
destroy_disks=destroy_disks)

注意 spawn 调用 _create_domain_and_network 时传递的参数 destroy_disks_on_failure=True,如果发生异常则会进行关机以及执行 cleanup 函数,且传递的 destroy_disks=True。

在我们这个场景下,如果在 nova.conf 中设置了内存超分比,目标主机通过了调度筛选,而实际物理内存又不足,虚拟机就会启动失败,spawn 的异常处理随即调用 cleanup 并传入 destroy_disks=True,最终导致疏散失败且根磁盘被删除。看了一下,目前 S 版的 nova 依然存在这个问题。

后来反思,在找到删除磁盘的操作位于 cleanup 函数、开始逆向查找调用方时,居然鬼使神差地只去查哪些地方会调用 destroy 函数,而忽略了其他地方直接调用 cleanup 函数的可能性,浪费了不少时间。下次调试时还是要一步一步回推才行。