OpenStack 二次开发实战:Nova 核心改造与生产实践

OpenStack 二次开发实战:Nova 核心改造与生产实践

开发环境搭建

DevStack(快速验证)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Fastest path: one-shot DevStack deployment.
git clone https://opendev.org/openstack/devstack
cd devstack

# Write local.conf. The heredoc delimiter is quoted, so the text below is
# written verbatim without shell expansion.
# Assigning ENABLED_SERVICES replaces DevStack's default service list, so the
# database and message-queue backends (mysql, rabbit) must be listed
# explicitly or the services that depend on them cannot start.
cat > local.conf << 'EOF'
[[local|localrc]]
ADMIN_PASSWORD=secret
DATABASE_PASSWORD=secret
RABBIT_PASSWORD=secret
SERVICE_PASSWORD=secret

# 只启用需要的服务
ENABLED_SERVICES=mysql,rabbit,key,n-api,n-cpu,n-cond,n-sch,n-novnc
ENABLED_SERVICES+=,placement-api
ENABLED_SERVICES+=,q-svc,q-agt,q-dhcp,q-l3,q-meta
ENABLED_SERVICES+=,c-api,c-vol,c-sch
ENABLED_SERVICES+=,g-api,g-reg

# 使用本地源码(二次开发关键)
NOVA_REPO=/path/to/your/nova
NOVA_BRANCH=my-feature-branch
EOF

# Deploy using the configuration written above.
./stack.sh

源码调试(pdb/VSCode)

1
2
3
4
5
6
7
# Insert a breakpoint in nova/compute/manager.py.
import pdb; pdb.set_trace()

# Or use remote_pdb (does not block other requests).
# Bind to loopback only: the debugger socket is unauthenticated, and anyone
# who can connect to it can run arbitrary code inside the service process.
# To reach it from another machine, forward the port over SSH instead of
# listening on every interface.
import remote_pdb
remote_pdb.set_trace(host='127.0.0.1', port=4444)
# Then connect with: telnet localhost 4444
1
2
3
4
5
6
7
8
9
10
11
// VSCode launch.json (remote debugging): an "attach" configuration that
// connects to host "devstack-host" on port 5678 and maps the local workspace
// folder onto /opt/stack/nova on the remote side.
// NOTE(review): presumably the remote Nova process must already be running a
// debug-server listener on that port — confirm before use.
{
"name": "Nova Compute",
"type": "python",
"request": "attach",
"connect": {"host": "devstack-host", "port": 5678},
"pathMappings": [{
"localRoot": "${workspaceFolder}",
"remoteRoot": "/opt/stack/nova"
}]
}

常见改造场景

1. 自定义调度策略

场景:按业务标签调度,同一业务的 VM 尽量分散到不同宿主机。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# nova/scheduler/filters/business_spread_filter.py

from nova.scheduler import filters
from nova import objects

class BusinessSpreadFilter(filters.BaseHostFilter):
    """Spread instances that share a ``business_tag`` across hosts.

    A host passes while it runs fewer than ``business_max_per_host``
    instances carrying the requested tag. Both values are read from the
    flavor extra specs of the request.
    """

    # NOTE(review): this attribute is not read anywhere in this block;
    # confirm that the base class actually consumes it.
    RUN_ON_DB = False

    # Per-host cap used when the flavor does not set ``business_max_per_host``
    # or sets a value that is not an integer.
    DEFAULT_MAX_PER_HOST = 3

    def host_passes(self, host_state, spec_obj):
        """Return True when ``host_state`` may receive the requested instance.

        :param host_state: candidate host; only ``host_state.host`` is read.
        :param spec_obj: request spec; only ``flavor.extra_specs`` is read.
        :returns: True if the request has no ``business_tag`` or the host is
            still below the per-host cap for that tag, False otherwise.
        """
        extra_specs = spec_obj.flavor.extra_specs
        business_tag = extra_specs.get('business_tag')
        if not business_tag:
            # No tag on the request: this filter imposes no restriction.
            return True

        max_per_host = self._parse_max_per_host(
            extra_specs.get('business_max_per_host'))

        # Number of instances with the same tag already on this host.
        count = self._get_business_vm_count(host_state.host, business_tag)

        # Reject the host once the cap is reached.
        return count < max_per_host

    def _parse_max_per_host(self, raw_value):
        """Convert the extra-spec value to an int, tolerating bad input.

        Extra specs are operator-supplied strings; a typo there must not
        raise inside the scheduler, so invalid values fall back to
        ``DEFAULT_MAX_PER_HOST`` and are reported with a warning.
        """
        if raw_value is None:
            return self.DEFAULT_MAX_PER_HOST
        try:
            return int(raw_value)
        except (TypeError, ValueError):
            import logging
            logging.getLogger(__name__).warning(
                "Invalid business_max_per_host %r; using default %d",
                raw_value, self.DEFAULT_MAX_PER_HOST)
            return self.DEFAULT_MAX_PER_HOST

    def _get_business_vm_count(self, host, tag):
        """Return how many instances on ``host`` carry business tag ``tag``.

        NOTE(review): ``count_instances_by_host_and_tag`` is not defined in
        this file; it is assumed to be a custom helper added to
        ``nova.db.main.api`` (for example a query on the instance metadata
        table, or a cache lookup).
        """
        from nova.db.main import api as db_api
        return db_api.count_instances_by_host_and_tag(host, tag)
1
2
3
# nova.conf
# Append BusinessSpreadFilter to the scheduler's enabled filter list; the
# leading "..." is a placeholder for the filters that are already enabled.
[filter_scheduler]
enabled_filters = ...,BusinessSpreadFilter

2. 虚拟机创建前后 Hook

场景:VM 创建后自动注册到 CMDB,删除后自动注销。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# nova/compute/manager.py 修改

def _build_and_run_instance(self, context, instance, ...):
try:
# 原有逻辑
self.driver.spawn(context, instance, ...)

# 创建成功后 Hook
self._post_create_hook(context, instance)

except Exception as e:
# 创建失败 Hook
self._create_failed_hook(context, instance, e)
raise

def _post_create_hook(self, context, instance):
    """Register a newly built instance with the CMDB on a best-effort basis.

    Any exception raised while collecting the fields or calling the CMDB
    client is logged as a warning and swallowed, so a CMDB outage never
    fails the instance build.

    :param context: request context (not used by this hook).
    :param instance: the instance that was just created.
    """
    try:
        # Collect the record first, then hand it to the CMDB client.
        vm_record = dict(
            vm_id=instance.uuid,
            vm_name=instance.display_name,
            host=instance.host,
            project_id=instance.project_id,
            ip=instance.access_ip_v4,
            created_at=instance.created_at.isoformat(),
        )
        cmdb_client.register_vm(**vm_record)
        LOG.info(f"VM {instance.uuid} 已注册到 CMDB")
    except Exception as e:
        # A failing hook must not affect instance creation.
        LOG.warning(f"CMDB 注册失败: {e}")

3. 扩展虚拟机 API

场景:添加自定义 API,获取 VM 的实时 CPU 使用率。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# nova/api/openstack/compute/contrib/vm_metrics.py

from nova.api.openstack import extensions
from nova.api.openstack import wsgi

class VmMetricsController(wsgi.Controller):
    """Server-action controller that exposes live metrics for an instance.

    NOTE(review): ``self.compute_api`` and ``self.compute_rpcapi`` are not
    initialised in this class; they are assumed to be set up elsewhere
    (for example in an ``__init__`` omitted from this excerpt).
    """

    @wsgi.action('get_metrics')
    def get_metrics(self, req, id, body):
        """Handle the ``get_metrics`` server action.

        Server actions are submitted as ``POST /servers/{id}/action`` with
        the action name as the key of the request body.

        :param req: incoming request; the Nova context is read from
            ``req.environ['nova.context']``.
        :param id: identifier of the target server.
        :param body: request body of the action (not inspected here).
        :returns: ``{'metrics': <metrics>}``.
        """
        context = req.environ['nova.context']

        # Look up the target instance.
        instance = self.compute_api.get(context, id)

        # Fetch the live metrics for that instance.
        metrics = self._get_vm_metrics(context, instance)

        return {'metrics': metrics}

    def _get_vm_metrics(self, context, instance):
        """Ask nova-compute over RPC for the metrics of ``instance``.

        ``context`` is passed in explicitly: it is a local of
        ``get_metrics`` and is not otherwise visible in this method.
        """
        return self.compute_rpcapi.get_instance_metrics(
            context, instance
        )

4. 自定义虚拟机规格校验

场景:限制某些 Project 只能创建特定规格的 VM。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# nova/api/openstack/compute/servers.py 修改

def create(self, req, body):
    """Create a server after checking the project may use the flavor.

    :param req: incoming request carrying ``nova.context`` in its environ.
    :param body: request body; ``body['server']['flavorRef']`` is read.
    :returns: whatever the existing ``_create`` implementation returns.
    """
    ctxt = req.environ['nova.context']
    flavor_ref = body['server']['flavorRef']

    # Custom validation; raises if the project may not use this flavor.
    self._validate_flavor_for_project(ctxt, flavor_ref)

    # Hand over to the existing creation logic.
    return self._create(req, body)

def _validate_flavor_for_project(self, context, flavor_id):
    """Check that the requesting project is allowed to use the flavor.

    :param context: request context; ``context.project_id`` identifies the
        project.
    :param flavor_id: the flavor id taken from the request's ``flavorRef``.
    :raises exception.FlavorNotAllowed: if the project has an allow-list and
        the flavor's name is not on it.
    """
    project_id = context.project_id

    # ``flavorRef`` carries the public flavor id (the ``flavorid`` field),
    # not the database primary key, so the lookup must go through
    # ``get_by_flavor_id`` rather than ``get_by_id``.
    flavor = objects.Flavor.get_by_flavor_id(context, flavor_id)

    # Restriction rules come from custom configuration or the database.
    allowed_flavors = self._get_allowed_flavors(project_id)
    if not allowed_flavors:
        # An empty or missing allow-list means the project is unrestricted.
        return

    if flavor.name not in allowed_flavors:
        raise exception.FlavorNotAllowed(
            flavor=flavor.name,
            project=project_id
        )

数据库 Schema 变更

OpenStack 使用 Alembic 管理数据库迁移:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# nova/db/main/migrations/versions/xxx_add_custom_field.py

from alembic import op
import sqlalchemy as sa

def upgrade():
    """Add the nullable ``business_tag`` column and index it.

    Nova keeps a ``shadow_instances`` twin of ``instances`` that receives
    archived rows, so the two tables must stay column-compatible: the column
    is added to both. The index is only needed on the live table.
    """
    for table_name in ('instances', 'shadow_instances'):
        op.add_column(
            table_name,
            sa.Column('business_tag', sa.String(255), nullable=True),
        )
    op.create_index(
        'ix_instances_business_tag',
        'instances',
        ['business_tag'],
    )


def downgrade():
    """Reverse ``upgrade``: drop the index first, then the column."""
    op.drop_index('ix_instances_business_tag', table_name='instances')
    for table_name in ('shadow_instances', 'instances'):
        op.drop_column(table_name, 'business_tag')
1
2
3
4
5
# Apply the migrations
nova-manage db sync

# Show the migration status
nova-manage db version

单元测试

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# nova/tests/unit/compute/test_manager.py

from unittest import mock
from nova.compute import manager
from nova import test

class TestBuildInstance(test.TestCase):
    """Unit tests for ``ComputeManager._build_and_run_instance``.

    NOTE(review): ``self.context`` and ``self._create_fake_instance`` are
    not defined in this excerpt; they are assumed to come from a fixture or
    helper omitted here. The ``...`` arguments are likewise elided.
    """

    def setUp(self):
        super().setUp()
        self.compute = manager.ComputeManager()

    # Decorators apply bottom-up, so the network mock is the first injected
    # argument and the spawn mock the second.
    @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.spawn')
    @mock.patch('nova.network.neutron.API.allocate_for_instance')
    def test_build_instance_success(self, mock_network, mock_spawn):
        """A successful build spawns once and allocates networking once."""
        # Arrange
        instance = self._create_fake_instance()

        # Act
        self.compute._build_and_run_instance(
            self.context, instance, ...
        )

        # Assert
        mock_spawn.assert_called_once()
        mock_network.assert_called_once_with(
            self.context, instance, ...
        )

    def test_build_instance_network_failure(self):
        """A network allocation failure aborts the build and rolls back."""
        # Imported locally because this excerpt has no module-level import
        # for the exception module.
        from nova import exception

        # The instance must be created here; each test has its own locals.
        instance = self._create_fake_instance()

        with mock.patch(
            'nova.network.neutron.API.allocate_for_instance',
            side_effect=Exception("Network error")
        ):
            self.assertRaises(
                exception.BuildAbortException,
                self.compute._build_and_run_instance,
                self.context, instance, ...
            )

        # The instance is expected to end up in the ERROR state.
        self.assertEqual('error', instance.vm_state)

版本管理与发布

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Create a company branch on top of an upstream stable release.
git clone https://github.com/openstack/nova.git
# Enter the clone; the git commands below must run inside the repository.
cd nova
git checkout stable/2024.1 # Caracal release
git checkout -b mycompany/2024.1

# Tag the result once development is finished.
git tag mycompany-2024.1.1-patch1

# Build a distributable package (bdist_rpm produces an RPM).
python setup.py bdist_rpm
# Or build a wheel into dist/ (the original note attributes this to pbr).
pip wheel . -w dist/

# Deploy by replacing the installed package, then restart the service.
# NOTE(review): the systemd unit name varies by distribution — confirm it
# on the target host.
pip install dist/nova-*.whl --upgrade
systemctl restart openstack-nova-compute

生产改造经验总结

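| 改造点 | 风险 | 建议 |
| --- | --- | --- |
| 修改 nova/compute/manager.py | 高(核心文件) | 最小化改动,充分测试 |
| 添加自定义 Filter | (未标注) | 推荐,影响范围小 |
| 修改数据库 Schema | (未标注) | 只加列不删列,保持向后兼容 |
| 修改 API 响应格式 | (未标注) | 避免,会破坏客户端兼容性 |
| 添加新 API Endpoint | (未标注) | 通过 Extension 机制添加 |
| 修改调度算法 | (未标注) | 充分压测,关注调度延迟 |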

核心原则:最小化侵入。尽量通过配置、插件、Extension 实现需求,避免直接修改核心代码,降低升级成本。