综合优化技能
针对Vivado和Quartus工具的FPGA综合优化专家技能。提供深入的专业知识,包括综合报告分析、属性应用、资源推断控制和QoR(结果质量)改进。
概览
综合优化技能为FPGA设计提供全面的综合优化支持,包括:
- 综合报告分析和资源利用审查
- 综合属性(KEEP、MAX_FANOUT、RAM_STYLE等)
- DSP和BRAM推断指导
- FSM编码优化(one-hot、二进制、Gray)
- 重定时和寄存器平衡
- 逻辑优化策略
- 降低高扇出网络的扇出
- 多供应商综合流程
能力
1. 综合报告分析
解析和分析综合报告:
# Vivado synthesis report analysis.
# Resource utilization broken down by design hierarchy.
report_utilization -hierarchical
# Utilization restricted to the primitive (leaf) cells of the design.
report_utilization -cells [get_cells -hier -filter {IS_PRIMITIVE}]
# Timing summary covering both setup and hold checks.
report_timing_summary -setup -hold
# Machine-readable utilization export.
# report_utilization -format accepts "table" (default) or "xml" and only takes
# effect together with -file; "csv" is not a supported format.
report_utilization -format xml -file utilization.xml
# Inspect specific resource types by primitive reference name.
report_utilization -cells [get_cells -hier -filter {REF_NAME =~ DSP*}]
report_utilization -cells [get_cells -hier -filter {REF_NAME =~ RAM*}]
2. 综合属性
应用Xilinx/Vivado综合属性:
// Illustrative Vivado synthesis attribute declarations (not a complete module).
// Preserve the module hierarchy for debugging
(* KEEP_HIERARCHY = "yes" *)
module critical_path_module (
// ...
);
// Prevent the register from being optimized
(* DONT_TOUCH = "yes" *) logic debug_reg;
// Control register replication to meet timing
(* MAX_FANOUT = 50 *) logic high_fanout_signal;
// Preserve the signal so synthesis does not optimize it away
(* KEEP = "true" *) logic preserved_signal;
// RAM style control
(* RAM_STYLE = "block" *) logic [7:0] large_mem [1024];
(* RAM_STYLE = "distributed" *) logic [7:0] small_mem [16];
(* RAM_STYLE = "registers" *) logic [7:0] tiny_mem [4];
(* RAM_STYLE = "ultra" *) logic [7:0] uram_mem [4096]; // UltraRAM
// ROM style control
(* ROM_STYLE = "block" *) logic [7:0] lookup_table [256];
// Use DSP blocks for arithmetic
(* USE_DSP = "yes" *) logic [47:0] mult_result;
(* USE_DSP = "no" *) logic [15:0] small_mult; // use fabric logic
// Shift-register (SRL) extraction control
(* SHREG_EXTRACT = "yes" *) logic [15:0] shift_reg;
(* SRL_STYLE = "register" *) logic [7:0] no_srl_shift;
// Asynchronous registers of a CDC synchronizer
(* ASYNC_REG = "TRUE" *) logic [1:0] sync_reg;
// FSM encoding
(* FSM_ENCODING = "one_hot" *) enum logic [3:0] {
IDLE, INIT, RUN, DONE
} state;
// Alternative FSM_ENCODING values; each line shows only the attribute and
// must be attached to a state register declaration to have any effect.
(* FSM_ENCODING = "sequential" *) // binary encoding
(* FSM_ENCODING = "gray" *) // Gray encoding
(* FSM_ENCODING = "johnson" *) // Johnson encoding
(* FSM_ENCODING = "auto" *) // tool decides
3. Intel/Quartus属性
应用Quartus综合属性:
// Illustrative Intel/Quartus synthesis attribute declarations.
// RAM style
(* ramstyle = "M20K" *) logic [7:0] intel_bram [1024];
(* ramstyle = "MLAB" *) logic [7:0] intel_lutram [32];
(* ramstyle = "logic" *) logic [7:0] intel_ff_mem [8];
(* ramstyle = "no_rw_check" *) logic [7:0] dual_port_mem [256];
// Preserve signals
(* preserve *) logic keep_signal;
(* noprune *) logic unused_but_keep;
// DSP usage
(* multstyle = "dsp" *) logic [31:0] use_dsp;
(* multstyle = "logic" *) logic [7:0] use_fabric;
// Synchronizer identification
(* altera_attribute = "-name SYNCHRONIZER_IDENTIFICATION FORCED" *)
logic [1:0] altera_sync;
// Maximum fanout
(* maxfan = 32 *) logic fanout_limited;
4. DSP推断优化
指导DSP48/DSP块推断:
// Multiply-accumulate written to match the DSP48 register structure.
//
// Parameters:
//   A_WIDTH, B_WIDTH - widths of the signed multiplier operands
//   P_WIDTH          - width of the signed addend, accumulator and output
// Ports:
//   clk   - clock
//   rst_n - active-low reset, sampled synchronously on clk
//   ce    - clock enable shared by every pipeline register
//   a, b  - multiplier operands
//   c     - value added to the product when load is high
//   load  - 1: p <= c + product, 0: p <= p + product (accumulate)
//   p     - registered result
//
// Pipeline: a/b -> a_reg/b_reg -> mult_reg -> p.
module dsp_mac #(
  parameter int A_WIDTH = 18,
  parameter int B_WIDTH = 18,
  parameter int P_WIDTH = 48
) (
  input  logic                      clk,
  input  logic                      rst_n,
  input  logic                      ce,
  input  logic signed [A_WIDTH-1:0] a,
  input  logic signed [B_WIDTH-1:0] b,
  input  logic signed [P_WIDTH-1:0] c,
  input  logic                      load,  // load C instead of accumulating
  output logic signed [P_WIDTH-1:0] p
);
  // Pipeline registers
  logic signed [A_WIDTH-1:0] a_reg;
  logic signed [B_WIDTH-1:0] b_reg;
  logic signed [P_WIDTH-1:0] mult_reg;

  // Synchronous reset on purpose: the registers inside a DSP slice only have
  // synchronous resets, so an asynchronous reset would force these registers
  // into fabric flip-flops instead of the DSP block's A/B/M/P registers.
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      a_reg    <= '0;
      b_reg    <= '0;
      mult_reg <= '0;
      p        <= '0;
    end else if (ce) begin
      // Input registers (A and B)
      a_reg <= a;
      b_reg <= b;
      // Multiplier register (M)
      mult_reg <= a_reg * b_reg;
      // Accumulator / output register (P)
      p <= load ? c + mult_reg : p + mult_reg;
    end
  end
endmodule
// Small unsigned 8x8 multiplier with DSP inference disabled for the module.
(* USE_DSP = "no" *)
module small_mult (
  input  logic [7:0]  a,
  input  logic [7:0]  b,
  output logic [15:0] p
);
  // Purely combinational product; with USE_DSP = "no" it is expected to be
  // built from fabric LUTs.
  always_comb begin
    p = a * b;
  end
endmodule
5. BRAM推断优化
确保正确的块RAM推断:
// Simple dual-port RAM: one synchronous write port and one synchronous,
// registered read port sharing a single clock.
//
// Parameters:
//   DATA_WIDTH - word width in bits
//   DEPTH      - number of words
//   ADDR_WIDTH - address width, defaults to $clog2(DEPTH)
module sdp_ram #(
  parameter int DATA_WIDTH = 32,
  parameter int DEPTH      = 1024,
  parameter int ADDR_WIDTH = $clog2(DEPTH)
) (
  input  logic                  clk,
  // Write side
  input  logic                  wr_en,
  input  logic [ADDR_WIDTH-1:0] wr_addr,
  input  logic [DATA_WIDTH-1:0] wr_data,
  // Read side
  input  logic [ADDR_WIDTH-1:0] rd_addr,
  output logic [DATA_WIDTH-1:0] rd_data
);
  (* RAM_STYLE = "block" *)
  logic [DATA_WIDTH-1:0] mem [DEPTH];

  always_ff @(posedge clk) begin
    // Registered read: a read of the address being written in the same
    // cycle returns the previous contents (non-blocking assignments).
    rd_data <= mem[rd_addr];
    // Write only when enabled.
    if (wr_en)
      mem[wr_addr] <= wr_data;
  end
endmodule
// True dual-port RAM: two independent ports (A and B), each with its own
// clock, enable, write enable, address, write data and registered read data.
//
// Parameters:
//   DATA_WIDTH - word width in bits
//   DEPTH      - number of words; address width is $clog2(DEPTH)
module tdp_ram #(
  parameter int DATA_WIDTH = 32,
  parameter int DEPTH      = 1024
) (
  // Port A
  input  logic                     clk_a,
  input  logic                     en_a,
  input  logic                     wr_en_a,
  input  logic [$clog2(DEPTH)-1:0] addr_a,
  input  logic [DATA_WIDTH-1:0]    wr_data_a,
  output logic [DATA_WIDTH-1:0]    rd_data_a,
  // Port B
  input  logic                     clk_b,
  input  logic                     en_b,
  input  logic                     wr_en_b,
  input  logic [$clog2(DEPTH)-1:0] addr_b,
  input  logic [DATA_WIDTH-1:0]    wr_data_b,
  output logic [DATA_WIDTH-1:0]    rd_data_b
);
  (* RAM_STYLE = "block" *)
  logic [DATA_WIDTH-1:0] mem [DEPTH];

  // Both ports write the same array. SystemVerilog forbids a variable written
  // in an always_ff from being written by any other process, so the two port
  // processes use plain `always` blocks.

  // Port A
  always @(posedge clk_a) begin
    if (en_a) begin
      if (wr_en_a)
        mem[addr_a] <= wr_data_a;
      rd_data_a <= mem[addr_a];  // read-first: returns the old contents
    end
  end

  // Port B
  always @(posedge clk_b) begin
    if (en_b) begin
      if (wr_en_b)
        mem[addr_b] <= wr_data_b;
      rd_data_b <= mem[addr_b];  // read-first: returns the old contents
    end
  end
endmodule
6. FSM编码优化
选择最优的FSM编码:
// State type declarations illustrating three encodings.
// NOTE(review): the attributes below are attached to typedefs; Vivado
// documents FSM_ENCODING as an attribute of the state register, so confirm
// the tool honors it here or move it to the state variable declaration.
// One-hot encoding (fast, more registers)
(* FSM_ENCODING = "one_hot" *)
typedef enum logic [7:0] {
IDLE = 8'b00000001,
INIT = 8'b00000010,
CONFIG = 8'b00000100,
RUN = 8'b00001000,
PAUSE = 8'b00010000,
DONE = 8'b00100000,
ERROR = 8'b01000000,
RESET = 8'b10000000
} state_t;
// Binary encoding (compact, slower decode)
(* FSM_ENCODING = "sequential" *)
typedef enum logic [2:0] {
S_IDLE = 3'd0,
S_INIT = 3'd1,
S_RUN = 3'd2,
S_DONE = 3'd3
} compact_state_t;
// Gray encoding (low power, single-bit transitions between consecutive states)
(* FSM_ENCODING = "gray" *)
typedef enum logic [2:0] {
G_IDLE = 3'b000,
G_INIT = 3'b001,
G_RUN = 3'b011,
G_DONE = 3'b010
} gray_state_t;
// Safe FSM with illegal-state recovery.
//
// Ports:
//   clk   - clock
//   rst_n - active-low asynchronous reset; forces the FSM to IDLE
//   start - leaves IDLE when high
//   done  - leaves RUN when high
//   state - current state (type state_t, declared outside this module)
//
// Transitions: IDLE -(start)-> INIT -> RUN -(done)-> DONE -> IDLE.
// Any other state value falls through to the default branch and returns to IDLE.
module safe_fsm (
  input  logic   clk,
  input  logic   rst_n,
  input  logic   start,
  input  logic   done,
  output state_t state
);
  // Without FSM_SAFE_STATE, FSM extraction treats the default branch as
  // unreachable and removes it, so the recovery logic would not exist in
  // hardware. "default_state" makes synthesis implement the RTL default
  // branch for illegal state values.
  (* FSM_SAFE_STATE = "default_state" *)
  state_t current_state;
  state_t next_state;

  // State register
  always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n)
      current_state <= IDLE;
    else
      current_state <= next_state;
  end

  // Next-state logic; holds the current state unless a transition fires.
  always_comb begin
    next_state = current_state;
    case (current_state)
      IDLE:    if (start) next_state = INIT;
      INIT:    next_state = RUN;
      RUN:     if (done) next_state = DONE;
      DONE:    next_state = IDLE;
      default: next_state = IDLE;  // illegal-state recovery
    endcase
  end

  assign state = current_state;
endmodule
7. 高扇出网络优化
减少高扇出时序问题:
// Register replication to control fanout.
//
// Shows two alternatives for a high-fanout enable:
//   1. MAX_FANOUT on a single register (enable_r), letting the tool replicate;
//   2. four explicitly duplicated enable registers, one per byte lane of
//      data_out.
// The data sources are placeholders (/* ... */) to be filled in by the user.
module fanout_control (
  input  logic        clk,
  input  logic        rst_n,
  input  logic        enable_in,
  output logic [31:0] data_out
);
  // High-fanout enable - replicated by the tool to meet timing.
  // Note: enable_r is not read anywhere in this example.
  (* MAX_FANOUT = 50 *)
  logic enable_r;

  // Or replicate explicitly. The four registers are logically equivalent, so
  // synthesis would merge them back into one unless told to keep them; KEEP
  // preserves the manual duplicates.
  (* KEEP = "true" *) logic enable_bank0;
  (* KEEP = "true" *) logic enable_bank1;
  (* KEEP = "true" *) logic enable_bank2;
  (* KEEP = "true" *) logic enable_bank3;

  always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
      enable_r     <= 1'b0;
      enable_bank0 <= 1'b0;
      enable_bank1 <= 1'b0;
      enable_bank2 <= 1'b0;
      enable_bank3 <= 1'b0;
    end else begin
      enable_r <= enable_in;
      // Duplicated enables
      enable_bank0 <= enable_in;
      enable_bank1 <= enable_in;
      enable_bank2 <= enable_in;
      enable_bank3 <= enable_in;
    end
  end

  // Each byte lane (bank) uses its own dedicated enable register.
  always_ff @(posedge clk) begin
    if (enable_bank0) data_out[7:0] <= /* ... */;
    if (enable_bank1) data_out[15:8] <= /* ... */;
    if (enable_bank2) data_out[23:16] <= /* ... */;
    if (enable_bank3) data_out[31:24] <= /* ... */;
  end
endmodule
8. 重定时和流水线
应用重定时以改善时序:
// Vivado forward-retiming example: 8x8 unsigned multiply with a latency of
// three clock cycles (result at edge n+3 equals a*b sampled at edge n).
//
// RETIMING_FORWARD is a register-level attribute, and forward retiming can
// only move a register that sits in front of combinational logic. The input
// registers a_r/b_r therefore precede the multiplier and carry the attribute,
// allowing synthesis to push them forward into the multiplier logic.
module forward_retiming (
  input  logic        clk,
  input  logic [7:0]  a, b,
  output logic [15:0] result
);
  // Input stage: candidates for being moved forward through the multiplier.
  (* RETIMING_FORWARD = 1 *) logic [7:0] a_r;
  (* RETIMING_FORWARD = 1 *) logic [7:0] b_r;

  logic [15:0] mult;    // combinational product of the registered operands
  logic [15:0] mult_r;  // second pipeline stage

  assign mult = a_r * b_r;

  always_ff @(posedge clk) begin
    a_r    <= a;
    b_r    <= b;
    mult_r <= mult;
    result <= mult_r;   // third pipeline stage
  end
endmodule
// Manually balanced pipeline: delays data_in and valid_in by the same number
// of clock cycles so they stay aligned at the output.
//
// Parameters:
//   PIPELINE_STAGES - number of register stages (>= 1)
// Ports:
//   clk       - clock
//   data_in   - input data
//   valid_in  - qualifier for data_in
//   data_out  - data_in delayed by PIPELINE_STAGES cycles
//   valid_out - valid_in delayed by PIPELINE_STAGES cycles
//
// NOTE(review): there is no reset port, so valid_out is unknown in simulation
// until PIPELINE_STAGES clock edges have occurred.
module balanced_pipeline #(
  parameter int PIPELINE_STAGES = 3
) (
  input  logic        clk,
  input  logic [31:0] data_in,
  input  logic        valid_in,
  output logic [31:0] data_out,
  output logic        valid_out
);
  logic [31:0]                data_pipe [PIPELINE_STAGES];
  logic [PIPELINE_STAGES-1:0] valid_pipe;

  always_ff @(posedge clk) begin
    // Stage 0 samples the inputs.
    data_pipe[0]  <= data_in;
    valid_pipe[0] <= valid_in;
    // Remaining stages shift data and valid together. Shifting valid per bit
    // (instead of a {valid_pipe[PIPELINE_STAGES-2:0], valid_in} concatenation)
    // keeps the module legal when PIPELINE_STAGES is 1.
    for (int i = 1; i < PIPELINE_STAGES; i++) begin
      data_pipe[i]  <= data_pipe[i-1];
      valid_pipe[i] <= valid_pipe[i-1];
    end
  end

  assign data_out  = data_pipe[PIPELINE_STAGES-1];
  assign valid_out = valid_pipe[PIPELINE_STAGES-1];
endmodule
流程集成
这项技能与以下流程集成:
| 流程 | 集成点 |
|---|---|
| synthesis-optimization.js | 主要综合优化 |
| timing-closure.js | 综合时序 |
| place-and-route.js | 综合后优化 |
| power-analysis-optimization.js | 功耗感知综合 |
输出模式
{
"synthesisAnalysis": {
"resourceUtilization": {
"luts": { "used": 45000, "available": 203800, "percentage": 22.1 },
"registers": { "used": 52000, "available": 407600, "percentage": 12.8 },
"bram": { "used": 120, "available": 445, "percentage": 27.0 },
"dsp": { "used": 48, "available": 740, "percentage": 6.5 }
},
"hierarchy": [
{ "module": "processor", "luts": 25000, "regs": 30000 },
{ "module": "memory_ctrl", "luts": 10000, "regs": 12000 }
],
"criticalPaths": [
{ "from": "reg_a", "to": "reg_b", "slack": -0.5, "levels": 12 }
]
},
"optimizations": [
{ "type": "attribute", "target": "sync_reg", "attribute": "ASYNC_REG" },
{ "type": "encoding", "target": "state_fsm", "encoding": "one_hot" },
{ "type": "ramStyle", "target": "data_mem", "style": "block" }
],
"recommendations": [
"Consider pipelining multiplier at line 125",
"High fanout on enable signal (fan=500) - add MAX_FANOUT",
"Consider distributed RAM for small_mem (depth=16)"
],
"artifacts": [
"src/optimized_module.sv",
"reports/utilization.rpt",
"reports/timing_summary.rpt"
]
}
最佳实践
属性应用
- 在所有CDC同步器上使用ASYNC_REG
- 对高扇出信号应用MAX_FANOUT
- 通过RAM_STYLE控制存储器推断,使推断结果可预测
- 谨慎使用USE_DSP以平衡资源
资源推断
- 为RAM输出添加寄存器,以便推断为BRAM
- 按照DSP48的寄存器结构编写代码,以便自动推断
- 使用SRL实现深移位寄存器
- 避免BRAM的异步复位
时序优化
- 为长组合路径添加流水线阶段
- 在适当时使用重定时属性
- 在模块间平衡流水线阶段
- 考虑FSM编码对时序的影响
参考资料
- Xilinx UG901: Vivado综合指南
- Xilinx UG949: UltraFast设计方法论
- Intel Quartus Prime综合指南
- Yosys开源综合套件(Yosys Open SYnthesis Suite)手册
另见
- synthesis-optimization.js - 综合优化流程
- timing-closure.js - 时序收敛方法论
- SK-010: 布局布线技能
- AG-007: 综合专家代理