Skip to content

Commit 9d29968

Browse files
authored
Merge pull request Xilinx#1123 from Xilinx/dev
Release merge for v0.10.1
2 parents e3087ad + f649cda commit 9d29968

84 files changed

Lines changed: 1812 additions & 1131 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/quicktest-dev-pr.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,5 @@ jobs:
2222
export FINN_ROOT=$(pwd)
2323
export FINN_BUILD_DIR=/tmp/finn_gha
2424
export FINN_INST_NAME=finn_gha
25+
export FINN_SKIP_XRT_DOWNLOAD=1
2526
./run-docker.sh quicktest

AUTHORS.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,6 @@ Contributors
3434
* Shashwat Khandelwal (@shashwat1198)
3535
* Ian Colbert (@i-colbert)
3636
* Rachit Garg (@rstar900)
37+
* Christoph Berganski (@iksnagreb)
38+
* Jonas Kuehle (@vopade)
39+
* Aditya S (@Adityasrinivas24)

docker/Dockerfile.finn

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ FROM ubuntu:jammy-20230126
3131
LABEL maintainer="Jakoba Petri-Koenig <jakoba.petri-koenig@amd.com>, Yaman Umuroglu <yaman.umuroglu@amd.com>"
3232

3333
ARG XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt"
34+
ARG SKIP_XRT
35+
ARG LOCAL_XRT
3436

3537
WORKDIR /workspace
3638

@@ -78,15 +80,19 @@ RUN cd verilator && \
7880
make install
7981

8082
# install XRT
81-
RUN wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb
82-
RUN apt install -y /tmp/$XRT_DEB_VERSION.deb
83-
RUN rm /tmp/$XRT_DEB_VERSION.deb
83+
RUN if [ -z "$LOCAL_XRT" ] && [ -z "$SKIP_XRT" ];then \
84+
wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb" -O /tmp/$XRT_DEB_VERSION.deb; fi
85+
86+
COPY requirements.txt $XRT_DEB_VERSION.* /tmp/
87+
88+
RUN if [ -z "$SKIP_XRT" ];then \
89+
apt install -y /tmp/$XRT_DEB_VERSION.deb && \
90+
rm /tmp/$XRT_DEB_VERSION.deb; fi
8491

8592
# versioned Python package requirements for FINN compiler
8693
# these are given in requirements.txt
87-
COPY requirements.txt .
88-
RUN pip install -r requirements.txt
89-
RUN rm requirements.txt
94+
RUN pip install -r /tmp/requirements.txt
95+
RUN rm /tmp/requirements.txt
9096

9197
# install PyTorch
9298
RUN pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
@@ -126,6 +132,9 @@ RUN pip install tokenize-rt==4.2.1
126132
# pyverilator
127133
RUN pip install tclwrapper==0.0.1
128134

135+
# assure that we have the right setuptools version
136+
RUN pip install setuptools==68.2.2
137+
129138
# extra environment variables for FINN compiler
130139
ENV VIVADO_IP_CACHE "/tmp/vivado_ip_cache"
131140

docker/finn_entrypoint.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ if [ -f "$VITIS_PATH/settings64.sh" ];then
8686
source $XILINX_XRT/setup.sh
8787
gecho "Found XRT at $XILINX_XRT"
8888
else
89-
recho "XRT not found on $XILINX_XRT, did the installation fail?"
89+
recho "XRT not found on $XILINX_XRT, did you skip the download or did the installation fail?"
9090
exit -1
9191
fi
9292
else

fetch-repos.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2929

3030
QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f"
31-
FINN_EXP_COMMIT="de99347e936d51715f5356a1b6c64e37b91c23c2"
31+
FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
3232
BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db"
3333
PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
3434
CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"

finn-rtllib/fifo/hdl/Q_srl.v

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
7474
parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256)
7575
parameter width = 16; // - width of data (i_d, o_d)
7676

77-
parameter addrwidth = $clog2(depth);
77+
localparam countwidth = $clog2(depth + 1);
78+
localparam addrwidth = $clog2(depth);
7879

7980
input clock;
8081
input reset;
@@ -89,10 +90,10 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
8990
input o_r; // - output stream ready
9091
wire o_b; // - output stream back-pressure
9192

92-
output [addrwidth:0] count; // - output number of elems in queue
93-
output [addrwidth:0] maxcount; // - maximum observed count since reset
93+
output [countwidth-1:0] count; // - output number of elems in queue
94+
output [countwidth-1:0] maxcount; // - maximum observed count since reset
9495

95-
reg [addrwidth:0] maxcount_reg; // - maximum count seen until now
96+
reg [countwidth-1:0] maxcount_reg; // - maximum count seen until now
9697
reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address
9798
// for data output
9899
reg shift_en_; // - SRL16 shift enable

finn-rtllib/mvu/mvu_4sx4u.sv

Lines changed: 114 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,9 @@ module mvu_4sx4u #(
3636
int unsigned SIMD,
3737
int unsigned ACCU_WIDTH,
3838

39-
int unsigned VERSION = 1,
39+
int unsigned VERSION = 1, // Version 1 (DSP48E1) *must* commit to NARROW_WEIGHTS
4040
bit SIGNED_ACTIVATIONS = 0,
41+
bit NARROW_WEIGHTS = 0, // Weights from [-7:7] rather than [-8:7]
4142
bit FORCE_BEHAVIORAL = 0
4243
)(
4344
// Global Control
@@ -62,6 +63,55 @@ module mvu_4sx4u #(
6263
`endif
6364
FORCE_BEHAVIORAL;
6465

66+
//-----------------------------------------------------------------------
67+
// Determine Lane Configuration
68+
initial begin
69+
if(!NARROW_WEIGHTS && (VERSION == 1)) begin
70+
$error("%m: Need NARROW_WEIGHTS for DSP48E1.");
71+
$finish;
72+
end
73+
end
74+
75+
/**
76+
* Lane Slicing
77+
* Assumptions:
78+
* - Internal lane widths differ, at most, by a single bit.
79+
* - The rightmost lane (#0) has the maximum internal width.
80+
* - The leftmost lane (#3) extends into the wide DSP accumulation path and
81+
* is constrained by ACCU_WIDTH rather than the next lane. It doesn't have
82+
* an external high extension.
83+
* - The one but leftmost lane (#2) has the minimum internal width and, hence,
84+
* the maximum external high extension.
85+
*/
86+
typedef int unsigned lane_offset_v[4:0];
87+
function lane_offset_v sliceLanes();
88+
unique case(VERSION)
89+
1: begin
90+
return NARROW_WEIGHTS?
91+
lane_offset_v'{ ACCU_WIDTH+21, 21, 14, 7, 0 } :
92+
lane_offset_v'{ 0, 0, 0, 0, 0 }; // not supported
93+
end
94+
2: begin
95+
return NARROW_WEIGHTS?
96+
lane_offset_v'{ ACCU_WIDTH+23, 23, 16, 8, 0 } :
97+
lane_offset_v'{ ACCU_WIDTH+22, 22, 15, 8, 0 };
98+
end
99+
endcase
100+
endfunction : sliceLanes
101+
localparam lane_offset_v OFFSETS = sliceLanes();
102+
103+
function int unsigned lo_width(input int unsigned i);
104+
return OFFSETS[i+1] - OFFSETS[i];
105+
endfunction : lo_width
106+
function int unsigned hi_width(input int unsigned i);
107+
return 1 + $clog2(2**(ACCU_WIDTH-lo_width(i)-1)+SIMD);
108+
endfunction : hi_width
109+
localparam int unsigned LO_WIDTH_MAX = OFFSETS[1] - OFFSETS[0];
110+
localparam int unsigned HI_WIDTH_MAX = hi_width(2);
111+
112+
localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath
113+
114+
// Compute the count of descendants for all nodes in the reduction trees.
65115
typedef int unsigned leave_load_t[2*SIMD-1];
66116
function leave_load_t init_leave_loads();
67117
automatic leave_load_t res;
@@ -79,16 +129,14 @@ module mvu_4sx4u #(
79129
assign vld = L[5];
80130

81131
// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
82-
localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets
83-
84132
localparam int unsigned PIPE_COUNT = (PE+3)/4;
85133
for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes
86134

87135
localparam int unsigned PE_BEG = 4*c;
88136
localparam int unsigned PE_END = PE < 4*(c+1)? PE : 4*(c+1);
89137
localparam int unsigned PE_REM = 4*(c+1) - PE_END;
90138

91-
uwire [57:0] p3[SIMD];
139+
uwire [47:0] p3[SIMD];
92140
uwire signed [ 1:0] h3[SIMD][3];
93141
for(genvar s = 0; s < SIMD; s++) begin : genSIMD
94142

@@ -98,10 +146,10 @@ module mvu_4sx4u #(
98146
logic [26:0] dd;
99147
logic [ 1:0] xx[3:1];
100148
if(1) begin : blkVectorize
101-
uwire [3:0] ww[PE_END - PE_BEG];
149+
uwire signed [3:0] ww[PE_END - PE_BEG];
102150
for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin
103151
assign ww[pe] = w[PE_BEG + pe][s];
104-
if(pe) begin
152+
if(pe > 0) begin
105153
if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s];
106154
`ifndef VERILATOR
107155
else begin
@@ -123,8 +171,19 @@ module mvu_4sx4u #(
123171
dd = '0;
124172
aa = '0;
125173
for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin
126-
dd[D[pe + PE_REM]+:3] = ww[pe];
127-
aa[D[pe + PE_REM]+ 3] = ww[pe][3];
174+
automatic int unsigned ofs = OFFSETS[pe + PE_REM];
175+
dd[ofs+:3] = ww[pe];
176+
assert(!NARROW_WEIGHTS || rst || !en || zero || (ww[pe] != -8)) else begin
177+
$warning("%m: Weight of -8 violates NARROW_WEIGHTS commitment.");
178+
end
179+
180+
// The sign bit of each weight is generally put on the subtracted A port.
181+
// However, when coinciding with the actual sign bit position of the
182+
// multiplier input path, it goes onto the D input instead. This prevents
183+
// sign extensions that may happen when a DSP primitive is auto-promoted
184+
// to a newer generation.
185+
if(ofs+3 == A_WIDTH-1) dd[ofs+3] = ww[pe][3];
186+
else aa[ofs+3] = ww[pe][3];
128187
end
129188
end
130189
end : blkVectorize
@@ -135,14 +194,15 @@ module mvu_4sx4u #(
135194
// rst can be only applied to AD and zero only to B
136195
// with the same effect as zeroing both.
137196
if(BEHAVIORAL) begin : genBehav
197+
138198
// Stage #1: Input Refine
139199
logic signed [17:0] B1 = 0;
140200
always_ff @(posedge clk) begin
141201
if(zero) B1 <= 0;
142202
else if(en) B1 <= bb;
143203
end
144204

145-
logic signed [26:0] AD1 = 0;
205+
logic signed [A_WIDTH-1:0] AD1 = 0;
146206
always_ff @(posedge clk) begin
147207
if(rst) AD1 <= 0;
148208
else if(en) AD1 <= dd - aa;
@@ -429,14 +489,14 @@ module mvu_4sx4u #(
429489
X1 <= xx;
430490
X2 <= X1;
431491
foreach(X3[i]) begin
432-
X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]);
492+
X3[i] <= X2[i] + (L[3]? 2'h0 : pp[OFFSETS[i]+:2]);
433493
end
434494
end
435495
end
436496

437497
// Derive actual cross-lane overflows
438498
for(genvar i = 0; i < 3; i++) begin
439-
assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1];
499+
assign h3[s][i] = pp[OFFSETS[i+1]+:2] - X3[i+1];
440500
end
441501
assign p3[s] = pp;
442502

@@ -445,48 +505,59 @@ module mvu_4sx4u #(
445505
// Stage #4: Cross-SIMD Reduction
446506

447507
// Count leaves reachable from each node
448-
localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
508+
localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
449509

450-
uwire signed [ACCU_WIDTH -1:0] up4;
451-
uwire signed [ACCU_WIDTH -8:0] hi4[3];
452-
uwire [$clog2(SIMD)+7:0] lo4[3];
510+
uwire signed [ACCU_WIDTH-1:0] up4;
511+
uwire signed [ HI_WIDTH_MAX-1:0] hi4[3];
512+
uwire [$clog2(SIMD)+LO_WIDTH_MAX-1:0] lo4[3];
453513
for(genvar i = 0; i < 4; i++) begin
454-
localparam int unsigned LO_WIDTH = D[i+1] - D[i];
455-
localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
456514

457515
// Conclusive high part accumulation
458-
if(i >= PE_REM && i < 3) begin : genHi
459-
// Adder Tree across all SIMD high contributions, each from [-1:1]
460-
uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree;
461-
for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i];
462-
for(genvar n = 0; n < SIMD-1; n++) begin
463-
// Sum truncated to actual maximum bit width at this node
464-
uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
465-
assign tree[n] = s;
466-
end
516+
if(i < 3) begin : genHi
517+
if(i < PE_REM) assign hi4[i] = '0;
518+
else begin
519+
localparam int unsigned HI_WIDTH = hi_width(i);
520+
521+
// Adder Tree across all SIMD high contributions, each from [-1:1]
522+
uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree;
523+
for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i];
524+
for(genvar n = 0; n < SIMD-1; n++) begin
525+
// Sum truncated to actual maximum bit width at this node
526+
uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
527+
assign tree[n] = s;
528+
end
529+
530+
// High Sideband Accumulation
531+
logic signed [HI_WIDTH-1:0] Hi4 = 0;
532+
always_ff @(posedge clk) begin
533+
if(rst) Hi4 <= 0;
534+
else if(en) begin
535+
automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]);
536+
assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin
537+
$error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH);
538+
$stop;
539+
end
540+
Hi4 <= h;
541+
end
542+
end
543+
assign hi4[i] = Hi4;
467544

468-
// High Sideband Accumulation
469-
logic signed [HI_WIDTH-1:0] Hi4 = 0;
470-
always_ff @(posedge clk) begin
471-
if(rst) Hi4 <= 0;
472-
else if(en) Hi4 <= (L[4]? 0 : Hi4) + $signed(tree[0]);
473545
end
474-
assign hi4[i] = Hi4;
475546
end : genHi
476-
else if (i < 3) begin : genHiZero
477-
assign hi4[i] = '0;
478-
end : genHiZero
479547

480-
// Conclusive low part accumulation
481-
if(i >= PE_REM) begin : blkLo
548+
// Conclusive low part accumulation (all unsigned arithmetic)
549+
if(i < PE_REM) assign lo4[i] = '0;
550+
else begin : genLo
551+
localparam int unsigned LO_WIDTH = lo_width(i);
552+
482553
// Adder Tree across all SIMD low contributions
483554
localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
484555
uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree;
485-
for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
556+
for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH];
486557
for(genvar n = 0; n < SIMD-1; n++) begin
487558
// Sum truncated to actual maximum bit width at this node
488559
localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
489-
uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
560+
uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2];
490561
assign tree[n] = s;
491562
end
492563

@@ -498,10 +569,7 @@ module mvu_4sx4u #(
498569

499570
if(i == 3) assign up4 = Lo4;
500571
else assign lo4[i] = Lo4;
501-
end : blkLo
502-
else begin : blkLoZero
503-
assign lo4[i] = '0;
504-
end : blkLoZero
572+
end : genLo
505573

506574
end
507575

@@ -511,9 +579,9 @@ module mvu_4sx4u #(
511579
if(rst) Res5 <= '{ default: 0 };
512580
else if(en) begin
513581
Res5[3] <= up4 - hi4[2];
514-
Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1];
515-
Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0];
516-
Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] });
582+
Res5[2] <= $signed({ hi4[2], {(lo_width(2)){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1];
583+
Res5[1] <= $signed({ hi4[1], {(lo_width(1)){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0];
584+
Res5[0] <= $signed({ hi4[0], {(lo_width(0)){1'b0}} }) + $signed({ 1'b0, lo4[0] });
517585
end
518586
end
519587

0 commit comments

Comments
 (0)