云迁移规划
概述
云迁移规划包括评估现有基础设施、设计迁移策略、以尽可能短的停机时间执行迁移,并对迁移结果进行验证。支持直接迁移(lift-and-shift,即重新托管)、平台重构(replatform)和应用重构(refactor)等方法,以实现平滑的云采用。
何时使用
- 从本地迁移到云
- 云平台整合
- 遗留系统现代化
- 降低数据中心成本
- 提高可扩展性和可用性
- 满足合规性要求
- 增强灾难恢复
- 技术更新换代计划
实施示例
1. 迁移评估和规划
# 云迁移评估工具
from enum import Enum
from typing import Dict, List, Tuple
from dataclasses import dataclass
class MigrationStrategy(Enum):
    """Migration approaches that can be recommended for an application."""

    # Rehost the workload as-is.
    LIFT_AND_SHIFT = "lift_and_shift"
    # Rehost and optimize.
    REPLATFORM = "replatform"
    # Rebuild for the cloud.
    REFACTOR = "refactor"
    # Switch to a SaaS offering.
    REPURCHASE = "repurchase"
    # Decommission the application.
    RETIRE = "retire"
class ApplicationComplexity(Enum):
    """Three-level complexity rating assigned to an assessed application."""

    LOW = 1
    MEDIUM = 2
    HIGH = 3
@dataclass
class ApplicationAssessment:
    """Assessment record for one application considered for migration."""

    # Application name; other assessments refer to it by this name in
    # their `dependencies` list.
    name: str
    complexity: ApplicationComplexity
    # Names of the applications this one depends on.
    dependencies: List[str]
    # Estimated migration effort, in days.
    estimated_effort: int
    # Business criticality on a 1-10 scale.
    business_criticality: int
    # Current running cost, per year.
    current_costs: float
    # Estimated cloud running cost, per year.
    cloud_costs_estimate: float
class CloudMigrationPlanner:
    """Collect application assessments and derive strategy, waves and ROI.

    Applications are registered with `add_application`; the other methods
    are read-only computations over the registered list.
    """

    # Cost per effort-day used when judging whether a refactor pays back.
    REFACTOR_DAILY_COST = 500
    # Cost per effort-day used when estimating the overall migration cost.
    MIGRATION_DAILY_COST = 250
    # A MEDIUM-complexity app is refactored only if payback is faster than this.
    MAX_REFACTOR_PAYBACK_MONTHS = 6
    # A HIGH-complexity app is refactored only at or above this criticality.
    REFACTOR_CRITICALITY_THRESHOLD = 8

    def __init__(self):
        self.applications: List[ApplicationAssessment] = []
        # Initialised to zero; not updated by any method in this class.
        self.total_effort = 0
        self.total_cost_savings = 0

    def add_application(self, app: ApplicationAssessment):
        """Register an application for the migration assessment."""
        self.applications.append(app)

    def recommend_migration_strategy(self, app: ApplicationAssessment) -> MigrationStrategy:
        """Recommend a migration strategy from the application's traits.

        - LOW complexity: lift and shift.
        - MEDIUM complexity: refactor if the estimated refactor cost is paid
          back by the annual savings in under six months, else replatform.
        - Anything else (HIGH): refactor when business criticality is 8 or
          more, otherwise suggest retiring the application.
        """
        if app.complexity == ApplicationComplexity.LOW:
            return MigrationStrategy.LIFT_AND_SHIFT

        if app.complexity == ApplicationComplexity.MEDIUM:
            annual_savings = app.current_costs - app.cloud_costs_estimate
            refactor_cost = app.estimated_effort * self.REFACTOR_DAILY_COST
            # No savings means the refactor never pays back.
            if annual_savings > 0:
                payback_months = (refactor_cost / annual_savings) * 12
            else:
                payback_months = float('inf')
            if payback_months < self.MAX_REFACTOR_PAYBACK_MONTHS:
                return MigrationStrategy.REFACTOR
            return MigrationStrategy.REPLATFORM

        # High complexity: only worth modernising when it is business critical.
        if app.business_criticality >= self.REFACTOR_CRITICALITY_THRESHOLD:
            return MigrationStrategy.REFACTOR
        return MigrationStrategy.RETIRE

    def create_migration_wave_plan(self) -> Dict:
        """Build a phased migration plan.

        Applications are visited in order of (fewest dependencies, highest
        business criticality). An application is scheduled once every name
        in its `dependencies` has been scheduled; LOW complexity goes to
        ``wave_1``, MEDIUM to ``wave_2`` and everything else to ``wave_3``.

        The list is re-scanned until a pass schedules nothing new, so an
        application is not lost merely because one of its dependencies
        sorts after it. Applications whose dependencies are unknown or
        circular stay unscheduled; this shows up as ``migrated_count``
        being smaller than ``total_applications``.

        Returns:
            Dict with keys ``waves``, ``total_applications``,
            ``migrated_count`` and ``total_effort_days``.
        """
        # NOTE(review): waves are assigned by complexity only, so an app can
        # land in an earlier wave than one of its dependencies — confirm
        # whether waves are meant to imply execution order.
        wave_by_complexity = {
            ApplicationComplexity.LOW: 'wave_1',
            ApplicationComplexity.MEDIUM: 'wave_2',
        }
        waves = {
            'wave_1': [],  # low risk, few dependencies
            'wave_2': [],  # medium risk
            'wave_3': [],  # high risk or critical
        }
        migrated = set()

        pending = sorted(
            self.applications,
            key=lambda a: (len(a.dependencies), -a.business_criticality),
        )
        while pending:
            deferred = []
            for app in pending:
                if all(dep in migrated for dep in app.dependencies):
                    wave = wave_by_complexity.get(app.complexity, 'wave_3')
                    waves[wave].append(app.name)
                    migrated.add(app.name)
                else:
                    deferred.append(app)
            if len(deferred) == len(pending):
                # Nothing was scheduled in this pass: the remaining apps have
                # missing or circular dependencies.
                break
            pending = deferred

        return {
            'waves': waves,
            'total_applications': len(self.applications),
            'migrated_count': len(migrated),
            'total_effort_days': sum(app.estimated_effort for app in self.applications),
        }

    def calculate_roi(self) -> Dict:
        """Compute return-on-investment figures for the whole portfolio.

        Returns:
            Dict with total current and cloud costs, annual savings, the
            estimated one-off migration cost (effort days times
            `MIGRATION_DAILY_COST`), payback period in months (infinite when
            there are no savings) and net savings after one and three years.
        """
        total_current_costs = sum(app.current_costs for app in self.applications)
        total_cloud_costs = sum(app.cloud_costs_estimate for app in self.applications)
        annual_savings = total_current_costs - total_cloud_costs

        # Estimate the one-off migration cost from the total effort.
        total_effort_days = sum(app.estimated_effort for app in self.applications)
        migration_cost = total_effort_days * self.MIGRATION_DAILY_COST

        if annual_savings > 0:
            payback_months = (migration_cost / annual_savings) * 12
        else:
            payback_months = float('inf')

        return {
            'total_current_costs': total_current_costs,
            'total_cloud_costs': total_cloud_costs,
            'annual_savings': annual_savings,
            'migration_cost': migration_cost,
            'payback_months': payback_months,
            'year1_savings': annual_savings - migration_cost,
            'year3_savings': (annual_savings * 3) - migration_cost,
        }
# Usage: assess three applications, then print the wave plan and the ROI.
planner = CloudMigrationPlanner()

app1 = ApplicationAssessment(
    name="Web Frontend",
    complexity=ApplicationComplexity.LOW,
    dependencies=[],
    estimated_effort=5,
    business_criticality=7,
    current_costs=50000,
    cloud_costs_estimate=30000,
)
app2 = ApplicationAssessment(
    name="API Backend",
    complexity=ApplicationComplexity.MEDIUM,
    dependencies=["Database"],
    estimated_effort=20,
    business_criticality=9,
    current_costs=80000,
    cloud_costs_estimate=40000,
)
app3 = ApplicationAssessment(
    name="Database",
    complexity=ApplicationComplexity.HIGH,
    dependencies=[],
    estimated_effort=30,
    business_criticality=10,
    current_costs=120000,
    cloud_costs_estimate=80000,
)

planner.add_application(app1)
planner.add_application(app2)
planner.add_application(app3)

print("迁移波计划:")
print(planner.create_migration_wave_plan())
# The leading newline must be an escape sequence; a raw line break inside a
# single-quoted string literal is a syntax error.
print("\nROI分析:")
print(planner.calculate_roi())
2. 数据库迁移策略
# AWS Database Migration Service (DMS): create the replication instance.
aws dms create-replication-instance \
  --replication-instance-identifier my-replication-instance \
  --replication-instance-class dms.t3.large \
  --allocated-storage 100 \
  --vpc-security-group-ids sg-12345

# Create the source endpoint.
# NOTE(review): the passwords below are placeholders; literal --password
# values end up in shell history, so supply real credentials another way.
aws dms create-endpoint \
  --endpoint-identifier source-db \
  --endpoint-type source \
  --engine-name postgres \
  --server-name source-db.example.com \
  --port 5432 \
  --username sourceadmin \
  --password sourcepassword \
  --database-name sourcedb

# Create the target endpoint.
aws dms create-endpoint \
  --endpoint-identifier target-rds \
  --endpoint-type target \
  --engine-name postgres \
  --server-name my-db.xyz.us-east-1.rds.amazonaws.com \
  --port 5432 \
  --username targetadmin \
  --password targetpassword \
  --database-name targetdb

# Create the migration task.
# --migration-type accepts full-load, cdc or full-load-and-cdc.
aws dms create-replication-task \
  --replication-task-identifier postgres-migration \
  --source-endpoint-arn arn:aws:dms:region:account:endpoint/source-db \
  --target-endpoint-arn arn:aws:dms:region:account:endpoint/target-rds \
  --replication-instance-arn arn:aws:dms:region:account:rep:my-replication-instance \
  --migration-type full-load \
  --table-mappings file://mappings.json

# Start the migration.
aws dms start-replication-task \
  --replication-task-arn arn:aws:dms:region:account:task:postgres-migration \
  --start-replication-task-type start-replication

# Monitor the migration (run after the task has been started).
aws dms describe-replication-tasks \
  --filters Name=replication-task-arn,Values=arn:aws:dms:region:account:task:postgres-migration
3. Terraform迁移基础设施
# migration.tf
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
    # Declared explicitly because this configuration creates a
    # random_password resource; without it the provider is only inferred
    # and left unpinned.
    random = {
      source  = "hashicorp/random"
      version = "~> 3.0"
    }
  }
}
# AWS provider; the region is taken from the aws_region input variable.
provider "aws" {
  region = var.aws_region
}
# VPC that hosts the migration infrastructure.
resource "aws_vpc" "migration" {
  cidr_block           = "10.100.0.0/16"
  enable_dns_hostnames = true

  tags = {
    Name = "migration-vpc"
  }
}
# Subnets for DMS: two /24 subnets, one per availability zone.
resource "aws_subnet" "migration" {
  count = 2

  vpc_id            = aws_vpc.migration.id
  availability_zone = data.aws_availability_zones.available.names[count.index]
  cidr_block        = "10.100.${count.index}.0/24"

  tags = {
    Name = "migration-subnet-${count.index}"
  }
}
# Replication subnet group spanning every migration subnet.
resource "aws_dms_replication_subnet_group" "migration" {
  replication_subnet_group_id          = "migration-subnet-group"
  replication_subnet_group_description = "Migration subnet group"
  subnet_ids                           = aws_subnet.migration[*].id
}
# DMS replication instance that runs the migration tasks.
resource "aws_dms_replication_instance" "migration" {
  replication_instance_id     = "migration-instance"
  replication_instance_class  = "dms.c5.2xlarge"
  replication_subnet_group_id = aws_dms_replication_subnet_group.migration.id

  # NOTE(review): the engine version is pinned while minor upgrades are
  # automatic — confirm 3.4.5 is still offered by DMS.
  engine_version             = "3.4.5"
  auto_minor_version_upgrade = true
  apply_immediately          = true

  allocated_storage   = 100
  multi_az            = true
  publicly_accessible = false

  tags = {
    Name = "migration-instance"
  }
}
# Source database endpoint; connection details come from input variables.
resource "aws_dms_endpoint" "source" {
  endpoint_id   = "source-postgres"
  endpoint_type = "source"
  engine_name   = "postgres"

  server_name   = var.source_db_host
  port          = 5432
  database_name = var.source_db_name
  username      = var.source_db_user
  password      = var.source_db_password
  ssl_mode      = "require"

  tags = {
    Name = "source-endpoint"
  }
}
# Target RDS endpoint; every connection detail is read from the RDS
# instance so the endpoint cannot drift from the database it points at.
resource "aws_dms_endpoint" "target" {
  endpoint_id   = "target-rds"
  endpoint_type = "target"
  engine_name   = "postgres"

  # `endpoint` is "address:port"; server_name needs the bare hostname.
  server_name   = aws_db_instance.target.address
  port          = aws_db_instance.target.port
  database_name = aws_db_instance.target.db_name
  username      = aws_db_instance.target.username
  # Use the password the instance was actually created with.
  password      = aws_db_instance.target.password
  # Matches the source endpoint; recent RDS PostgreSQL parameter groups
  # may force SSL by default.
  ssl_mode      = "require"

  tags = {
    Name = "target-endpoint"
  }
}
# Target RDS PostgreSQL instance.
# NOTE(review): no db_subnet_group_name or vpc_security_group_ids is set
# here — confirm the instance is reachable from the DMS replication instance.
resource "aws_db_instance" "target" {
  identifier     = "migration-target-db"
  engine         = "postgres"
  engine_version = "15.2"
  instance_class = "db.r5.2xlarge"

  allocated_storage = 100
  db_name           = "targetdb"
  username          = "postgres"
  password          = random_password.db.result

  multi_az            = true
  publicly_accessible = false

  backup_retention_period = 30
  backup_window           = "03:00-04:00"

  # Take a final snapshot when the instance is destroyed.
  skip_final_snapshot       = false
  final_snapshot_identifier = "migration-target-final-snapshot"
}
# Replication task: full load followed by ongoing change data capture.
resource "aws_dms_replication_task" "migration" {
  replication_task_id      = "postgres-full-migration"
  migration_type           = "full-load-and-cdc"
  replication_instance_arn = aws_dms_replication_instance.migration.replication_instance_arn
  source_endpoint_arn      = aws_dms_endpoint.source.endpoint_arn
  target_endpoint_arn      = aws_dms_endpoint.target.endpoint_arn

  # DMS table-mapping JSON uses hyphenated keys and expects a rule-name,
  # so the keys are quoted instead of written as HCL identifiers.
  table_mappings = jsonencode({
    rules = [
      {
        "rule-type"   = "selection"
        "rule-id"     = "1"
        "rule-name"   = "include-all-tables"
        "rule-action" = "include"
        "object-locator" = {
          "schema-name" = "%"
          "table-name"  = "%"
        }
      }
    ]
  })

  replication_task_settings = jsonencode({
    TargetMetadata = {
      TargetSchema = "public"
      SupportLobs  = true
      FullLobMode  = false
      LobChunkSize = 64
      LobMaxSize   = 32
    }
    FullLoadSettings = {
      TargetPrepMode                  = "DROP_AND_CREATE"
      CreatePkAfterFullLoad           = false
      StopTaskCachedChangesNotApplied = false
    }
    Logging = {
      EnableLogging = true
      # Each component is an { Id, Severity } pair.
      # NOTE(review): the previous single entry requested DEBUG severity;
      # raise the level on a component if that verbosity is still wanted.
      LogComponents = [
        { Id = "SOURCE_UNLOAD", Severity = "LOGGER_SEVERITY_DEFAULT" },
        { Id = "SOURCE_CAPTURE", Severity = "LOGGER_SEVERITY_DEFAULT" },
        { Id = "TARGET_LOAD", Severity = "LOGGER_SEVERITY_DEFAULT" },
        { Id = "TARGET_APPLY", Severity = "LOGGER_SEVERITY_DEFAULT" },
        { Id = "TASK_MANAGER", Severity = "LOGGER_SEVERITY_DEFAULT" }
      ]
    }
  })

  tags = {
    Name = "postgres-migration"
  }

  depends_on = [
    aws_dms_endpoint.source,
    aws_dms_endpoint.target,
    aws_dms_replication_instance.migration
  ]
}
# Secrets Manager secret that holds the migration credentials.
resource "aws_secretsmanager_secret" "migration_creds" {
  name_prefix = "migration/"
}
# Secret payload: source and target database passwords as a JSON object.
resource "aws_secretsmanager_secret_version" "migration_creds" {
  secret_id = aws_secretsmanager_secret.migration_creds.id

  secret_string = jsonencode({
    source_db_password = var.source_db_password
    # Store the password the target instance was actually created with.
    target_db_password = aws_db_instance.target.password
  })
}
# CloudWatch monitoring: log group with seven-day retention.
# NOTE(review): no other resource here references this log group —
# confirm that DMS actually writes to this name.
resource "aws_cloudwatch_log_group" "dms" {
  name              = "/aws/dms/migration"
  retention_in_days = 7
}
# Alarm that fires when the summed metric reaches 1 within one 5-minute period.
# NOTE(review): no dimensions or alarm_actions are set, and it should be
# confirmed that the AWS/DMS namespace publishes a FailureCount metric.
resource "aws_cloudwatch_metric_alarm" "migration_failed" {
  alarm_name        = "dms-migration-failed"
  alarm_description = "DMS迁移失败时发出警报"

  namespace   = "AWS/DMS"
  metric_name = "FailureCount"
  statistic   = "Sum"
  period      = 300

  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = 1
}
# Random master password for the target database.
resource "random_password" "db" {
  length  = 16
  special = true
  # RDS rejects '/', '@', '"' and spaces in master passwords; the
  # provider's default special set includes '@', so it is left out here.
  override_special = "!#$%&*()-_=+[]{}<>:?"
}
# Availability zones currently in the "available" state.
data "aws_availability_zones" "available" {
  state = "available"
}
# Outputs: the replication instance ID and the target database endpoint.
output "dms_instance_id" {
  value = aws_dms_replication_instance.migration.replication_instance_id
}

output "target_db_endpoint" {
  value = aws_db_instance.target.endpoint
}
4. 切换验证清单
# cutover-validation.yaml
# Checks to run before the cutover starts.
pre_cutover:
  - name: "源数据库健康检查"
    steps:
      - command: "SELECT COUNT(*) FROM pg_stat_replication;"
      - validate: "复制延迟<1秒"
      - expected: "所有副本同步"

  - name: "目标数据库准备情况"
    steps:
      - command: "SELECT datname, pg_size_pretty(pg_database_size(datname)) FROM pg_database;"
      - validate: "目标数据库大小与源匹配"
      - expected: "完全匹配"

  - name: "网络连接性"
    steps:
      - test: "源到目标的连接性"
      - command: "nc -zv target-db.rds.amazonaws.com 5432"
      - expected: "连接成功"

  - name: "备份验证"
    steps:
      - verify: "存在最近的备份"
      - test: "恢复到测试实例"
      - expected: "恢复成功"
# Steps performed during the cutover itself, in order.
cutover:
  - name: "切换前任务"
    steps:
      - "通知利益相关者"
      - "停止应用程序写入"
      - "验证复制延迟<1秒"
      - "从源捕获最终指标"

  - name: "DNS切换"
    steps:
      - "更新DNS指向目标"
      - "验证DNS传播"
      - "测试测试客户端的连接性"

  - name: "应用程序故障转移"
    steps:
      - "更新连接字符串"
      - "重新启动应用程序服务器"
      - "验证应用程序健康"
      - "运行烟雾测试"
# Validation and cleanup after the cutover.
post_cutover:
  - name: "验证"
    steps:
      - "在生产上运行测试套件"
      - "验证数据完整性"
      - "检查应用程序日志"
      - "监控错误率"

  - name: "清理"
    steps:
      - "记录最终指标"
      - "归档源数据库"
      - "更新文档"
      - "安排迁移后审查"
# Acceptance criteria for the cutover.
validation_criteria:
  - "零数据丢失"
  - "应用程序响应时间<200ms"
  - "错误率<0.1%"
  - "所有用户旅程通过"
  - "数据库复制成功"
最佳实践
✅ DO
- 进行全面的发现和评估
- 过渡期间让新旧系统并行运行
- 在切换前彻底测试
- 准备回滚计划
- 迁移后密切监控
- 记录所有更改
- 培训运营团队
- 暂时保留旧系统
❌ DON’T
- 未计划就匆忙迁移
- 未测试就迁移
- 忘记回滚程序
- 忽略依赖关系
- 跳过利益相关者沟通
- 一次性迁移所有内容
- 忘记更新文档
迁移阶段
- 评估(2-4周):发现、评估、计划
- 试点(2-8周):迁移非关键应用程序
- 波次迁移(8-16周):按优先级迁移
- 优化(4+周):微调云资源
- 收尾(1-2周):停用源系统