数据复制设置Skill data-replication-setup

本文档提供了数据库复制的详细指南,包括主从复制、多主复制、复制监控和故障转移等,旨在帮助实现高可用性和灾难恢复。

数据工程 0 次安装 0 次浏览 更新于 3/3/2026

数据复制设置

概览

配置数据库复制以实现灾难恢复、负载分配和高可用性。涵盖主从复制、多主复制和监控策略。

使用场景

  • 高可用性设置
  • 灾难恢复计划
  • 只读副本配置
  • 多区域复制
  • 复制监控和维护
  • 故障转移自动化
  • 跨区域备份策略

PostgreSQL复制

主从(主备)设置

PostgreSQL - 配置主服务器:

-- 在主服务器上:postgresql.conf
-- wal_level = replica
-- max_wal_senders = 10
-- wal_keep_size = 1GB

-- 创建复制用户
CREATE ROLE replication_user WITH REPLICATION ENCRYPTED PASSWORD 'secure_password';

-- 允许复制连接:pg_hba.conf
-- host    replication     replication_user   standby_ip/32    md5

-- 启用WAL归档以进行持续备份
-- archive_mode = on
-- archive_command = 'test ! -f /archive/%f && cp %p /archive/%f'

PostgreSQL - 设置备用服务器:

# 在备用服务器上

# 1. 如果正在运行,停止PostgreSQL
sudo systemctl stop postgresql

# 2. 从主服务器获取基础备份
pg_basebackup -h primary_ip -D /var/lib/postgresql/14/main \
  -U replication_user -v -P -W

# 3. 创建standby.signal文件
touch /var/lib/postgresql/14/main/standby.signal

# 4. 配置恢复:recovery.conf
# primary_conninfo = 'host=primary_ip user=replication_user password=password'

# 5. 启动PostgreSQL
sudo systemctl start postgresql

监控复制状态:

-- 在主服务器上:检查连接的备用服务器
SELECT pid, usename, application_name, client_addr, state
FROM pg_stat_replication;

-- 在主服务器上:检查复制延迟
SELECT slot_name, restart_lsn, confirmed_flush_lsn
FROM pg_replication_slots;

-- 在备用服务器上:检查恢复状态
SELECT pg_is_wal_replay_paused();
SELECT extract(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) as replication_lag_seconds;

逻辑复制

PostgreSQL - 逻辑复制设置:

-- 在发布者(主)
CREATE PUBLICATION users_publication FOR TABLE users, orders;

-- 创建复制槽
SELECT * FROM pg_create_logical_replication_slot('users_slot', 'pgoutput');

-- 在订阅者(备用)
CREATE SUBSCRIPTION users_subscription
CONNECTION 'host=publisher_ip dbname=mydb user=repuser password=pwd'
PUBLICATION users_publication
WITH (copy_data = true);

-- 检查订阅状态
SELECT subname, subenabled, subconninfo
FROM pg_subscription;

-- 监控复制状态
SELECT slot_name, restart_lsn, confirmed_flush_lsn
FROM pg_replication_slots
WHERE slot_type = 'logical';

MySQL复制

主从设置

MySQL - 配置主服务器:

-- 在MySQL配置中(my.cnf / my.ini)
-- [mysqld]
-- server-id = 1
-- log-bin = mysql-bin
-- binlog-format = ROW

-- 创建复制用户
CREATE USER 'replication'@'%' IDENTIFIED BY 'replication_password';
GRANT REPLICATION SLAVE ON *.* TO 'replication'@'%';
FLUSH PRIVILEGES;

-- 获取二进制日志位置
SHOW MASTER STATUS;
-- 文件:mysql-bin.000001
-- 位置:154

MySQL - 配置从服务器:

-- 在MySQL配置中(my.cnf / my.ini)
-- [mysqld]
-- server-id = 2
-- relay-log = mysql-relay-bin
-- binlog-format = ROW

-- 配置复制
CHANGE MASTER TO
  MASTER_HOST = '192.168.1.100',
  MASTER_USER = 'replication',
  MASTER_PASSWORD = 'replication_password',
  MASTER_LOG_FILE = 'mysql-bin.000001',
  MASTER_LOG_POS = 154;

-- 启动复制
START SLAVE;

-- 检查从服务器状态
SHOW SLAVE STATUS\G
-- 应显示:Slave_IO_Running: Yes, Slave_SQL_Running: Yes

监控MySQL复制:

-- 检查从服务器复制状态
SHOW SLAVE STATUS\G

-- 检查复制错误
SHOW SLAVE STATUS\G
-- 查看Last_Error字段

-- 停止和恢复复制
STOP SLAVE;
-- 修复任何问题...
START SLAVE;

-- 监控复制延迟
SHOW SLAVE STATUS\G
-- 检查:Seconds_Behind_Master

多主复制

MySQL - 循环复制:

-- 服务器1(主1)
-- [mysqld]
-- server-id = 1
-- log-bin = mysql-bin
-- auto_increment_increment = 2
-- auto_increment_offset = 1

CHANGE MASTER TO
  MASTER_HOST = '192.168.1.101',
  MASTER_USER = 'replication',
  MASTER_PASSWORD = 'password',
  MASTER_LOG_FILE = 'mysql-bin.000001',
  MASTER_LOG_POS = 154;

START SLAVE;

-- 服务器2(主2)
-- [mysqld]
-- server-id = 2
-- log-bin = mysql-bin
-- auto_increment_increment = 2
-- auto_increment_offset = 2

CHANGE MASTER TO
  MASTER_HOST = '192.168.1.100',
  MASTER_USER = 'replication',
  MASTER_PASSWORD = 'password',
  MASTER_LOG_FILE = 'mysql-bin.000001',
  MASTER_LOG_POS = 154;

START SLAVE;

复制监控

PostgreSQL - 复制健康检查:

-- 创建监控函数
CREATE OR REPLACE FUNCTION check_replication_health()
RETURNS TABLE (
  slot_name name,
  restart_lsn pg_lsn,
  confirmed_flush_lsn pg_lsn,
  lag_bytes bigint,
  status text
) AS $$
BEGIN
  RETURN QUERY
  SELECT
    rs.slot_name,
    rs.restart_lsn,
    rs.confirmed_flush_lsn,
    (pg_wal_lsn_diff(pg_current_wal_lsn(), rs.confirmed_flush_lsn))::bigint,
    CASE
      WHEN pg_wal_lsn_diff(pg_current_wal_lsn(), rs.confirmed_flush_lsn) < 1048576 THEN 'HEALTHY'
      WHEN pg_wal_lsn_diff(pg_current_wal_lsn(), rs.confirmed_flush_lsn) < 10485760 THEN 'WARNING'
      ELSE 'CRITICAL'
    END
  FROM pg_replication_slots rs
  WHERE slot_type = 'physical';
END;
$$ LANGUAGE plpgsql;

SELECT * FROM check_replication_health();

MySQL - 复制延迟监控:

-- 监控多个从服务器的复制延迟
CREATE TABLE replication_monitoring (
  slave_host VARCHAR(50),
  slave_port INT,
  master_log_file VARCHAR(50),
  read_master_log_pos BIGINT,
  relay_log_file VARCHAR(50),
  relay_log_pos BIGINT,
  seconds_behind_master INT,
  checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- 插入监控数据
INSERT INTO replication_monitoring
SELECT
  @@hostname,
  @@port,
  Master_Log_File,
  Read_Master_Log_Pos,
  Relay_Log_File,
  Relay_Log_Pos,
  Seconds_Behind_Master,
  CURRENT_TIMESTAMP
FROM INFORMATION_SCHEMA.TABLES
LIMIT 1;  -- 使用SHOW SLAVE STATUS值

复制故障转移

PostgreSQL - 将备用提升为主:

# 在备用服务器上
# 将备用提升为主
pg_ctl promote -D /var/lib/postgresql/14/main

# 或使用SQL命令
SELECT pg_promote();

MySQL - 将从服务器提升为主:

-- 在从服务器上
-- 1. 停止从服务器并等待复制完成
STOP SLAVE;
SHOW SLAVE STATUS\G  -- 验证Slave_IO_Running和Slave_SQL_Running是否关闭

-- 2. 提升为主
RESET SLAVE ALL;

-- 3. 重置二进制日志
RESET MASTER;

-- 4. 旧主成为新从
-- 配置旧主为新主的从服务器
CHANGE MASTER TO
  MASTER_HOST = 'new_master_ip',
  MASTER_USER = 'replication',
  MASTER_PASSWORD = 'password',
  MASTER_AUTO_POSITION = 1;

START SLAVE;

复制配置最佳实践

PostgreSQL - postgresql.conf设置:

# WAL配置
wal_level = replica
max_wal_senders = 10
wal_keep_size = 1GB
wal_receiver_timeout = 60s
wal_receiver_status_interval = 10s

# 热备用
hot_standby = on
max_standby_streaming_delay = 3min

# 复制超时
wal_sender_timeout = 300s

MySQL - my.cnf设置:

[mysqld]
# 复制配置
server-id = 1
log-bin = mysql-bin
binlog_format = ROW
binlog-row-image = FULL

# 从服务器配置
relay-log = mysql-relay-bin
relay-log-index = mysql-relay-bin.index
relay-log-info-repository = TABLE

# 安全性
log_replica_updates = ON
slave_parallel_workers = 4
slave_parallel_type = LOGICAL_CLOCK

复制故障排除

PostgreSQL - 复制问题:

-- 检查缺失文件
SELECT slot_name, restart_lsn, wal_status
FROM pg_replication_slots;

-- 重启复制槽
SELECT pg_replication_slot_advance('slot_name', pg_current_wal_lsn());

-- 同步复制
SYNCHRONOUS_COMMIT = remote_apply;

MySQL - 常见问题:

-- 检查重复条目错误
SHOW SLAVE STATUS\G
-- 查找Last_SQL_Error

-- 跳过错误
SET GLOBAL SQL_SLAVE_SKIP_COUNTER = 1;
START SLAVE;

-- 重置复制
RESET SLAVE;
RESET MASTER;

复制验证

  • 首先在非生产环境中测试故障转移
  • 在复制后验证数据一致性
  • 持续监控复制延迟
  • 记录所有复制配置
  • 测试备份/恢复程序
  • 安排定期复制审计

资源