diff --git a/Makefile b/Makefile index bf928da3..10c81441 100644 --- a/Makefile +++ b/Makefile @@ -166,10 +166,6 @@ RAM_INIT_FILE ?=hello_world/hello_world.hex FPGA_TARGET ?= ORANGE-CRAB-0.21 -# FIXME: icache RAMs aren't being inferrenced as block RAMs on ECP5 -# with yosys, so make it smaller for now as a workaround. -ICACHE_NUM_LINES=4 - clkgen=fpga/clk_gen_ecp5.vhd toplevel=fpga/top-generic.vhdl dmi_dtm=dmi_dtm_dummy.vhdl @@ -227,7 +223,7 @@ LITEDRAM_GHDL_ARG=-gUSE_LITEDRAM=true endif GHDL_IMAGE_GENERICS=-gMEMORY_SIZE=$(MEMORY_SIZE) -gRAM_INIT_FILE=$(RAM_INIT_FILE) \ - -gRESET_LOW=$(RESET_LOW) -gCLK_INPUT=$(CLK_INPUT) -gCLK_FREQUENCY=$(CLK_FREQUENCY) -gICACHE_NUM_LINES=$(ICACHE_NUM_LINES) \ + -gRESET_LOW=$(RESET_LOW) -gCLK_INPUT=$(CLK_INPUT) -gCLK_FREQUENCY=$(CLK_FREQUENCY) \ $(LITEDRAM_GHDL_ARG) diff --git a/common.vhdl b/common.vhdl index 59c855ed..eefa2fd9 100644 --- a/common.vhdl +++ b/common.vhdl @@ -194,6 +194,10 @@ package common is subtype real_addr_t is std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); function addr_to_real(addr: std_ulogic_vector(63 downto 0)) return real_addr_t; + -- Minimum page size + constant MIN_LG_PGSZ : positive := 12; + constant MIN_PAGESZ : positive := 2 ** MIN_LG_PGSZ; + -- Used for tracking instruction completion and pending register writes constant TAG_COUNT : positive := 4; constant TAG_NUMBER_BITS : natural := log2(TAG_COUNT); @@ -231,6 +235,7 @@ package common is type Fetch1ToIcacheType is record req: std_ulogic; + fetch_fail : std_ulogic; virt_mode : std_ulogic; priv_mode : std_ulogic; big_endian : std_ulogic; @@ -238,6 +243,9 @@ package common is predicted : std_ulogic; pred_ntaken : std_ulogic; nia: std_ulogic_vector(63 downto 0); + next_nia: std_ulogic_vector(63 downto 0); + rpn: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0); + next_rpn: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0); end record; type IcacheToDecode1Type is record @@ -606,7 +614,7 @@ package common is data : std_ulogic_vector(63 downto 0); end record; - type MmuToIcacheType is record + type MmuToITLBType is record tlbld : std_ulogic; tlbie : std_ulogic; doall : std_ulogic; @@ -658,7 +666,6 @@ package common is redirect: std_ulogic; redir_mode: std_ulogic_vector(3 downto 0); last_nia: std_ulogic_vector(63 downto 0); - br_offset: std_ulogic_vector(63 downto 0); br_last: std_ulogic; br_taken: std_ulogic; abs_br: std_ulogic; @@ -672,7 +679,7 @@ package common is write_data => (others => '0'), write_cr_mask => (others => '0'), write_cr_data => (others => '0'), write_reg => (others => '0'), interrupt => '0', intr_vec => 0, redirect => '0', redir_mode => "0000", - last_nia => (others => '0'), br_offset => (others => '0'), + last_nia => (others => '0'), br_last => '0', br_taken => '0', abs_br => '0', srr1 => (others => '0'), msr => (others => '0')); @@ -758,11 +765,14 @@ package common is br_nia : std_ulogic_vector(63 downto 0); br_last : std_ulogic; br_taken : std_ulogic; + interrupt : std_ulogic; + intr_vec : std_ulogic_vector(11 downto 0); end record; constant WritebackToFetch1Init : WritebackToFetch1Type := (redirect => '0', virt_mode => '0', priv_mode => '0', big_endian => '0', mode_32bit => '0', redirect_nia => (others => '0'), - br_last => '0', br_taken => '0', br_nia => (others => '0')); + br_last => '0', br_taken => '0', br_nia => (others => '0'), + interrupt => '0', intr_vec => x"000"); type WritebackToRegisterFileType is record write_reg : gspr_index_t; diff --git a/core.vhdl b/core.vhdl index a5560690..35a860e6 100644 --- a/core.vhdl +++ b/core.vhdl @@ -57,7 +57,7 @@ architecture behave of core is signal fetch1_to_icache : Fetch1ToIcacheType; signal writeback_to_fetch1: WritebackToFetch1Type; signal icache_to_decode1 : IcacheToDecode1Type; - signal mmu_to_icache : MmuToIcacheType; + signal mmu_to_itlb : MmuToITLBType; -- decode signals signal decode1_to_decode2: Decode1ToDecode2Type; @@ -223,6 +223,7 @@ begin generic map ( RESET_ADDRESS => (others => '0'), ALT_RESET_ADDRESS => ALT_RESET_ADDRESS, + TLB_SIZE => ICACHE_TLB_SIZE, HAS_BTC => HAS_BTC ) port map ( @@ -231,8 +232,9 @@ begin alt_reset_in => alt_reset_d, stall_in => fetch1_stall_in, flush_in => fetch1_flush, - inval_btc => ex1_icache_inval or mmu_to_icache.tlbie, + inval_btc => ex1_icache_inval or mmu_to_itlb.tlbie, stop_in => dbg_core_stop, + m_in => mmu_to_itlb, d_in => decode1_to_fetch1, w_in => writeback_to_fetch1, i_out => fetch1_to_icache, @@ -249,7 +251,6 @@ begin LINE_SIZE => 64, NUM_LINES => ICACHE_NUM_LINES, NUM_WAYS => ICACHE_NUM_WAYS, - TLB_SIZE => ICACHE_TLB_SIZE, LOG_LENGTH => LOG_LENGTH ) port map( @@ -257,7 +258,6 @@ begin rst => rst_icache, i_in => fetch1_to_icache, i_out => icache_to_decode1, - m_in => mmu_to_icache, flush_in => fetch1_flush, inval_in => dbg_icache_rst or ex1_icache_inval, stall_in => icache_stall_in, @@ -454,7 +454,7 @@ begin l_out => mmu_to_loadstore1, d_out => mmu_to_dcache, d_in => dcache_to_mmu, - i_out => mmu_to_icache + i_out => mmu_to_itlb ); dcache_0: entity work.dcache diff --git a/decode1.vhdl b/decode1.vhdl index e090d662..151977d3 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -35,8 +35,7 @@ architecture behaviour of decode1 is signal f, fin : Decode1ToFetch1Type; type br_predictor_t is record - br_nia : std_ulogic_vector(61 downto 0); - br_offset : signed(23 downto 0); + br_target : signed(61 downto 0); predict : std_ulogic; end record; @@ -94,8 +93,10 @@ architecture behaviour of decode1 is INSN_andi_dot => (ALU, NONE, OP_LOGIC, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), INSN_andis_dot => (ALU, NONE, OP_LOGIC, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), INSN_attn => (ALU, NONE, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), - INSN_b => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), - INSN_bc => (ALU, NONE, OP_BC, NONE, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_brel => (ALU, NONE, OP_B, CIA, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_babs => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_bcrel => (ALU, NONE, OP_BC, CIA, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_bcabs => (ALU, NONE, OP_BC, NONE, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), INSN_bcctr => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), INSN_bclr => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), INSN_bctar => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), @@ -310,6 +311,7 @@ architecture behaviour of decode1 is INSN_rlwimi => (ALU, NONE, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), INSN_rlwinm => (ALU, NONE, OP_RLC, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), INSN_rlwnm => (ALU, NONE, OP_RLC, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), + INSN_rnop => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_sc => (ALU, NONE, OP_SC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_setb => (ALU, NONE, OP_SETB, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_slbia => (LDST, NONE, OP_TLBIE, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -476,8 +478,6 @@ begin end if; end if; if rst = '1' then - br.br_nia <= (others => '0'); - br.br_offset <= (others => '0'); br.predict <= '0'; else br <= br_in; @@ -499,8 +499,8 @@ begin decode1_1: process(all) variable v : Decode1ToDecode2Type; variable vr : Decode1ToRegisterFileType; - variable br_target : std_ulogic_vector(61 downto 0); - variable br_offset : signed(23 downto 0); + variable br_nia : std_ulogic_vector(61 downto 0); + variable br_offset : std_ulogic_vector(23 downto 0); variable bv : br_predictor_t; variable icode : insn_code; variable sprn : spr_num_t; @@ -594,31 +594,28 @@ begin -- Branch predictor -- Note bclr, bcctr and bctar not predicted as we have no -- count cache or link stack. - br_offset := (others => '0'); + br_offset := f_in.insn(25 downto 2); case icode is - when INSN_b => + when INSN_brel | INSN_babs => -- Unconditional branches are always taken v.br_pred := '1'; - br_offset := signed(f_in.insn(25 downto 2)); - when INSN_bc => - -- Predict backward branches as taken, forward as untaken + when INSN_bcrel => + -- Predict backward relative branches as taken, others as untaken v.br_pred := f_in.insn(15); - br_offset := resize(signed(f_in.insn(15 downto 2)), 24); + br_offset(23 downto 14) := (others => '1'); when others => end case; - bv.br_nia := f_in.nia(63 downto 2); + br_nia := f_in.nia(63 downto 2); if f_in.insn(1) = '1' then - bv.br_nia := (others => '0'); + br_nia := (others => '0'); end if; - bv.br_offset := br_offset; + bv.br_target := signed(br_nia) + signed(br_offset); if f_in.next_predicted = '1' then v.br_pred := '1'; elsif f_in.next_pred_ntaken = '1' then v.br_pred := '0'; end if; bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted; - -- after a clock edge... - br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset); -- Work out GPR/FPR read addresses -- Note that for prefixed instructions we are working this out based @@ -665,7 +662,7 @@ begin d_out.decode <= decode; r_out <= vr; f_out.redirect <= br.predict; - f_out.redirect_nia <= br_target & "00"; + f_out.redirect_nia <= std_ulogic_vector(br.br_target) & "00"; flush_out <= bv.predict or br.predict; end process; diff --git a/decode2.vhdl b/decode2.vhdl index 1f3e7ffb..80dfabdc 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -221,9 +221,8 @@ architecture behaviour of decode2 is OP_SHR => "010", OP_EXTSWSLI => "010", OP_MUL_L64 => "011", -- muldiv_result - OP_B => "110", -- next_nia - OP_BC => "110", - OP_BCREG => "110", + OP_BCREG => "101", -- ramspr_result + OP_RFID => "101", OP_ADDG6S => "111", -- misc_result OP_ISEL => "111", OP_DARN => "111", diff --git a/decode_types.vhdl b/decode_types.vhdl index 9e7ef840..5b21fff4 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -47,14 +47,16 @@ package decode_types is INSN_andi_dot, -- 10 INSN_andis_dot, INSN_attn, - INSN_b, - INSN_bc, + INSN_brel, + INSN_babs, + INSN_bcrel, + INSN_bcabs, INSN_bcctr, INSN_bclr, INSN_bctar, - INSN_brh, + INSN_brh, -- 20 INSN_brw, - INSN_brd, -- 20 + INSN_brd, INSN_cbcdtd, INSN_cdtbcd, INSN_cmpi, @@ -62,9 +64,9 @@ package decode_types is INSN_cntlzw, INSN_cntlzd, INSN_cnttzw, - INSN_cnttzd, + INSN_cnttzd, -- 30 INSN_crand, - INSN_crandc, -- 30 + INSN_crandc, INSN_creqv, INSN_crnand, INSN_crnor, @@ -72,9 +74,9 @@ package decode_types is INSN_crorc, INSN_crxor, INSN_darn, - INSN_eieio, + INSN_eieio, -- 40 INSN_extsb, - INSN_extsh, -- 40 + INSN_extsh, INSN_extsw, INSN_extswsli, INSN_isync, @@ -82,9 +84,9 @@ package decode_types is INSN_ld, INSN_ldu, INSN_lhau, - INSN_lwa, + INSN_lwa, -- 50 INSN_lwzu, - INSN_mcrf, -- 50 + INSN_mcrf, INSN_mcrxrx, INSN_mfcr, INSN_mfmsr, @@ -92,9 +94,9 @@ package decode_types is INSN_mtcrf, INSN_mtmsr, INSN_mtmsrd, - INSN_mtspr, + INSN_mtspr, -- 60 INSN_mulli, - INSN_neg, -- 60 + INSN_neg, INSN_nop, INSN_ori, INSN_oris, @@ -102,46 +104,49 @@ package decode_types is INSN_popcntw, INSN_popcntd, INSN_prtyw, - INSN_prtyd, + INSN_prtyd, -- 70 INSN_rfid, - INSN_rldic, -- 70 + INSN_rldic, INSN_rldicl, INSN_rldicr, INSN_rldimi, INSN_rlwimi, INSN_rlwinm, + INSN_rnop, INSN_sc, - INSN_setb, + INSN_setb, -- 80 INSN_slbia, INSN_sradi, - INSN_srawi, -- 80 + INSN_srawi, INSN_stbu, INSN_std, INSN_stdu, INSN_sthu, INSN_stwu, INSN_subfic, - INSN_subfme, + INSN_subfme, -- 90 INSN_subfze, INSN_sync, - INSN_tdi, -- 90 + INSN_tdi, INSN_tlbsync, INSN_twi, INSN_wait, INSN_xori, INSN_xoris, + -- pad to 104 + INSN_063, INSN_064, INSN_065, INSN_066, INSN_067, -- Non-prefixed instructions that have a MLS:D prefixed form and -- their corresponding prefixed instructions. -- The non-prefixed versions have even indexes so that we can -- convert them to the prefixed version by setting bit 0 - INSN_addi, -- 96 + INSN_addi, -- 104 INSN_paddi, INSN_lbz, INSN_plbz, - INSN_lha, -- 100 + INSN_lha, INSN_plha, - INSN_lhz, + INSN_lhz, -- 110 INSN_plhz, INSN_lwz, INSN_plwz, @@ -149,11 +154,11 @@ package decode_types is INSN_pstb, INSN_sth, INSN_psth, - INSN_stw, -- 110 + INSN_stw, INSN_pstw, -- Slots for non-prefixed opcodes that are 8LS:D when prefixed - INSN_lhzu, -- 112 + INSN_lhzu, -- 120 INSN_plwa, INSN_op57, INSN_pld, @@ -161,8 +166,7 @@ package decode_types is INSN_pstd, -- pad to 128 to simplify comparison logic - INSN_076, INSN_077, - INSN_078, INSN_079, INSN_07a, INSN_07b, INSN_07c, INSN_07d, INSN_07e, INSN_07f, + INSN_07e, INSN_07f, -- The following instructions have an RB operand but don't access FPRs INSN_add, @@ -475,7 +479,308 @@ package decode_types is update => '0', reserve => '0', is_32bit => '0', is_signed => '0', rc => NONE, lr => '0', sgl_pipe => '0', repeat => NONE); + -- This function maps from insn_code values to primary opcode. + -- With this, we don't have to store the primary opcode of each instruction + -- in the icache if we are storing its insn_code. + function recode_primary_opcode(icode: insn_code) return std_ulogic_vector; + end decode_types; package body decode_types is + + function recode_primary_opcode(icode: insn_code) return std_ulogic_vector is + begin + case icode is + when INSN_addic => return "001100"; + when INSN_addic_dot => return "001101"; + when INSN_addi => return "001110"; + when INSN_addis => return "001111"; + when INSN_addpcis => return "010011"; + when INSN_andi_dot => return "011100"; + when INSN_andis_dot => return "011101"; + when INSN_attn => return "000000"; + when INSN_brel => return "010010"; + when INSN_babs => return "010010"; + when INSN_bcrel => return "010000"; + when INSN_bcabs => return "010000"; + when INSN_brh => return "011111"; + when INSN_brw => return "011111"; + when INSN_brd => return "011111"; + when INSN_cmpi => return "001011"; + when INSN_cmpli => return "001010"; + when INSN_lbz => return "100010"; + when INSN_lbzu => return "100011"; + when INSN_lfd => return "110010"; + when INSN_lfdu => return "110011"; + when INSN_lfs => return "110000"; + when INSN_lfsu => return "110001"; + when INSN_lha => return "101010"; + when INSN_lhau => return "101011"; + when INSN_lhz => return "101000"; + when INSN_lhzu => return "101001"; + when INSN_lwz => return "100000"; + when INSN_lwzu => return "100001"; + when INSN_mulli => return "000111"; + when INSN_nop => return "011000"; + when INSN_ori => return "011000"; + when INSN_oris => return "011001"; + when INSN_rlwimi => return "010100"; + when INSN_rlwinm => return "010101"; + when INSN_rlwnm => return "010111"; + when INSN_sc => return "010001"; + when INSN_stb => return "100110"; + when INSN_stbu => return "100111"; + when INSN_stfd => return "110110"; + when INSN_stfdu => return "110111"; + when INSN_stfs => return "110100"; + when INSN_stfsu => return "110101"; + when INSN_sth => return "101100"; + when INSN_sthu => return "101101"; + when INSN_stw => return "100100"; + when INSN_stwu => return "100101"; + when INSN_subfic => return "001000"; + when INSN_tdi => return "000010"; + when INSN_twi => return "000011"; + when INSN_xori => return "011010"; + when INSN_xoris => return "011011"; + when INSN_maddhd => return "000100"; + when INSN_maddhdu => return "000100"; + when INSN_maddld => return "000100"; + when INSN_rldic => return "011110"; + when INSN_rldicl => return "011110"; + when INSN_rldicr => return "011110"; + when INSN_rldimi => return "011110"; + when INSN_rldcl => return "011110"; + when INSN_rldcr => return "011110"; + when INSN_ld => return "111010"; + when INSN_ldu => return "111010"; + when INSN_lwa => return "111010"; + when INSN_fdivs => return "111011"; + when INSN_fsubs => return "111011"; + when INSN_fadds => return "111011"; + when INSN_fsqrts => return "111011"; + when INSN_fres => return "111011"; + when INSN_fmuls => return "111011"; + when INSN_frsqrtes => return "111011"; + when INSN_fmsubs => return "111011"; + when INSN_fmadds => return "111011"; + when INSN_fnmsubs => return "111011"; + when INSN_fnmadds => return "111011"; + when INSN_std => return "111110"; + when INSN_stdu => return "111110"; + when INSN_fdiv => return "111111"; + when INSN_fsub => return "111111"; + when INSN_fadd => return "111111"; + when INSN_fsqrt => return "111111"; + when INSN_fsel => return "111111"; + when INSN_fre => return "111111"; + when INSN_fmul => return "111111"; + when INSN_frsqrte => return "111111"; + when INSN_fmsub => return "111111"; + when INSN_fmadd => return "111111"; + when INSN_fnmsub => return "111111"; + when INSN_fnmadd => return "111111"; + when INSN_prefix => return "000001"; + when INSN_op57 => return "111001"; + when INSN_op61 => return "111101"; + when INSN_add => return "011111"; + when INSN_addc => return "011111"; + when INSN_adde => return "011111"; + when INSN_addex => return "011111"; + when INSN_addg6s => return "011111"; + when INSN_addme => return "011111"; + when INSN_addze => return "011111"; + when INSN_and => return "011111"; + when INSN_andc => return "011111"; + when INSN_bperm => return "011111"; + when INSN_cbcdtd => return "011111"; + when INSN_cdtbcd => return "011111"; + when INSN_cmp => return "011111"; + when INSN_cmpb => return "011111"; + when INSN_cmpeqb => return "011111"; + when INSN_cmpl => return "011111"; + when INSN_cmprb => return "011111"; + when INSN_cntlzd => return "011111"; + when INSN_cntlzw => return "011111"; + when INSN_cnttzd => return "011111"; + when INSN_cnttzw => return "011111"; + when INSN_darn => return "011111"; + when INSN_dcbf => return "011111"; + when INSN_dcbst => return "011111"; + when INSN_dcbt => return "011111"; + when INSN_dcbtst => return "011111"; + when INSN_dcbz => return "011111"; + when INSN_divdeu => return "011111"; + when INSN_divweu => return "011111"; + when INSN_divde => return "011111"; + when INSN_divwe => return "011111"; + when INSN_divdu => return "011111"; + when INSN_divwu => return "011111"; + when INSN_divd => return "011111"; + when INSN_divw => return "011111"; + when INSN_eieio => return "011111"; + when INSN_eqv => return "011111"; + when INSN_extsb => return "011111"; + when INSN_extsh => return "011111"; + when INSN_extsw => return "011111"; + when INSN_extswsli => return "011111"; + when INSN_icbi => return "011111"; + when INSN_icbt => return "011111"; + when INSN_isel => return "011111"; + when INSN_lbarx => return "011111"; + when INSN_lbzcix => return "011111"; + when INSN_lbzux => return "011111"; + when INSN_lbzx => return "011111"; + when INSN_ldarx => return "011111"; + when INSN_ldbrx => return "011111"; + when INSN_ldcix => return "011111"; + when INSN_ldux => return "011111"; + when INSN_ldx => return "011111"; + when INSN_lfdx => return "011111"; + when INSN_lfdux => return "011111"; + when INSN_lfiwax => return "011111"; + when INSN_lfiwzx => return "011111"; + when INSN_lfsx => return "011111"; + when INSN_lfsux => return "011111"; + when INSN_lharx => return "011111"; + when INSN_lhaux => return "011111"; + when INSN_lhax => return "011111"; + when INSN_lhbrx => return "011111"; + when INSN_lhzcix => return "011111"; + when INSN_lhzux => return "011111"; + when INSN_lhzx => return "011111"; + when INSN_lwarx => return "011111"; + when INSN_lwaux => return "011111"; + when INSN_lwax => return "011111"; + when INSN_lwbrx => return "011111"; + when INSN_lwzcix => return "011111"; + when INSN_lwzux => return "011111"; + when INSN_lwzx => return "011111"; + when INSN_mcrxrx => return "011111"; + when INSN_mfcr => return "011111"; + when INSN_mfmsr => return "011111"; + when INSN_mfspr => return "011111"; + when INSN_modud => return "011111"; + when INSN_moduw => return "011111"; + when INSN_modsd => return "011111"; + when INSN_modsw => return "011111"; + when INSN_mtcrf => return "011111"; + when INSN_mtmsr => return "011111"; + when INSN_mtmsrd => return "011111"; + when INSN_mtspr => return "011111"; + when INSN_mulhd => return "011111"; + when INSN_mulhdu => return "011111"; + when INSN_mulhw => return "011111"; + when INSN_mulhwu => return "011111"; + when INSN_mulld => return "011111"; + when INSN_mullw => return "011111"; + when INSN_nand => return "011111"; + when INSN_neg => return "011111"; + when INSN_rnop => return "011111"; + when INSN_nor => return "011111"; + when INSN_or => return "011111"; + when INSN_orc => return "011111"; + when INSN_popcntb => return "011111"; + when INSN_popcntd => return "011111"; + when INSN_popcntw => return "011111"; + when INSN_prtyd => return "011111"; + when INSN_prtyw => return "011111"; + when INSN_setb => return "011111"; + when INSN_slbia => return "011111"; + when INSN_sld => return "011111"; + when INSN_slw => return "011111"; + when INSN_srad => return "011111"; + when INSN_sradi => return "011111"; + when INSN_sraw => return "011111"; + when INSN_srawi => return "011111"; + when INSN_srd => return "011111"; + when INSN_srw => return "011111"; + when INSN_stbcix => return "011111"; + when INSN_stbcx => return "011111"; + when INSN_stbux => return "011111"; + when INSN_stbx => return "011111"; + when INSN_stdbrx => return "011111"; + when INSN_stdcix => return "011111"; + when INSN_stdcx => return "011111"; + when INSN_stdux => return "011111"; + when INSN_stdx => return "011111"; + when INSN_stfdx => return "011111"; + when INSN_stfdux => return "011111"; + when INSN_stfiwx => return "011111"; + when INSN_stfsx => return "011111"; + when INSN_stfsux => return "011111"; + when INSN_sthbrx => return "011111"; + when INSN_sthcix => return "011111"; + when INSN_sthcx => return "011111"; + when INSN_sthux => return "011111"; + when INSN_sthx => return "011111"; + when INSN_stwbrx => return "011111"; + when INSN_stwcix => return "011111"; + when INSN_stwcx => return "011111"; + when INSN_stwux => return "011111"; + when INSN_stwx => return "011111"; + when INSN_subf => return "011111"; + when INSN_subfc => return "011111"; + when INSN_subfe => return "011111"; + when INSN_subfme => return "011111"; + when INSN_subfze => return "011111"; + when INSN_sync => return "011111"; + when INSN_td => return "011111"; + when INSN_tw => return "011111"; + when INSN_tlbie => return "011111"; + when INSN_tlbiel => return "011111"; + when INSN_tlbsync => return "011111"; + when INSN_wait => return "011111"; + when INSN_xor => return "011111"; + when INSN_bcctr => return "010011"; + when INSN_bclr => return "010011"; + when INSN_bctar => return "010011"; + when INSN_crand => return "010011"; + when INSN_crandc => return "010011"; + when INSN_creqv => return "010011"; + when INSN_crnand => return "010011"; + when INSN_crnor => return "010011"; + when INSN_cror => return "010011"; + when INSN_crorc => return "010011"; + when INSN_crxor => return "010011"; + when INSN_isync => return "010011"; + when INSN_mcrf => return "010011"; + when INSN_rfid => return "010011"; + when INSN_fcfids => return "111011"; + when INSN_fcfidus => return "111011"; + when INSN_fcmpu => return "111111"; + when INSN_fcmpo => return "111111"; + when INSN_mcrfs => return "111111"; + when INSN_ftdiv => return "111111"; + when INSN_ftsqrt => return "111111"; + when INSN_mtfsb => return "111111"; + when INSN_mtfsfi => return "111111"; + when INSN_fmrgow => return "111111"; + when INSN_fmrgew => return "111111"; + when INSN_mffs => return "111111"; + when INSN_mtfsf => return "111111"; + when INSN_fcpsgn => return "111111"; + when INSN_fneg => return "111111"; + when INSN_fmr => return "111111"; + when INSN_fnabs => return "111111"; + when INSN_fabs => return "111111"; + when INSN_frin => return "111111"; + when INSN_friz => return "111111"; + when INSN_frip => return "111111"; + when INSN_frim => return "111111"; + when INSN_frsp => return "111111"; + when INSN_fctiw => return "111111"; + when INSN_fctiwu => return "111111"; + when INSN_fctid => return "111111"; + when INSN_fcfid => return "111111"; + when INSN_fctidu => return "111111"; + when INSN_fcfidu => return "111111"; + when INSN_fctiwz => return "111111"; + when INSN_fctiwuz => return "111111"; + when INSN_fctidz => return "111111"; + when INSN_fctiduz => return "111111"; + when others => return "XXXXXX"; + end case; + end; + end decode_types; diff --git a/execute1.vhdl b/execute1.vhdl index 7c1ff8f6..dacd66c7 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -95,6 +95,7 @@ architecture behaviour of execute1 is exception : std_ulogic; trap : std_ulogic; advance_nia : std_ulogic; + redir_to_next : std_ulogic; new_msr : std_ulogic_vector(63 downto 0); take_branch : std_ulogic; direct_branch : std_ulogic; @@ -124,6 +125,9 @@ architecture behaviour of execute1 is res2_sel : std_ulogic_vector(1 downto 0); spr_select : spr_id; pmu_spr_num : std_ulogic_vector(4 downto 0); + redir_to_next : std_ulogic; + advance_nia : std_ulogic; + lr_from_next : std_ulogic; mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; @@ -145,6 +149,7 @@ architecture behaviour of execute1 is prev_prefixed => '0', oe => '0', mul_select => "00", res2_sel => "00", spr_select => spr_id_init, pmu_spr_num => 5x"0", + redir_to_next => '0', advance_nia => '0', lr_from_next => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', @@ -510,6 +515,7 @@ begin variable wr_addr : ramspr_index; variable even_wr_enab, odd_wr_enab : std_ulogic; variable even_wr_data, odd_wr_data : std_ulogic_vector(63 downto 0); + variable ramspr_even_data : std_ulogic_vector(63 downto 0); variable doit : std_ulogic; begin -- Read address mux and async RAM reading @@ -533,11 +539,16 @@ begin else wr_addr := ex1.ramspr_wraddr; end if; + if ex1.lr_from_next = '1' then + ramspr_even_data := next_nia; + else + ramspr_even_data := ex1.e.write_data; + end if; if interrupt_in.intr = '1' then even_wr_data := ex2.e.last_nia; odd_wr_data := intr_srr1(ctrl.msr, interrupt_in.srr1); else - even_wr_data := ex1.e.write_data; + even_wr_data := ramspr_even_data; odd_wr_data := ex1.ramspr_odd_data; end if; ramspr_wr_addr <= wr_addr; @@ -550,7 +561,7 @@ begin -- We assume no instruction executes in the cycle immediately following -- an interrupt, so we don't need to bypass interrupt data if ex1.se.ramspr_write_even = '1' and e_in.ramspr_even_rdaddr = ex1.ramspr_wraddr then - ramspr_even <= ex1.e.write_data; + ramspr_even <= ramspr_even_data; else ramspr_even <= even_rd_data; end if; @@ -593,7 +604,6 @@ begin shortmul_result when "011", muldiv_result when "100", ramspr_result when "101", - next_nia when "110", misc_result when others; execute1_0: process(clk) @@ -1016,7 +1026,6 @@ begin v.e.mode_32bit := not ex1.msr(MSR_SF); v.e.instr_tag := e_in.instr_tag; v.e.last_nia := e_in.nia; - v.e.br_offset := 64x"4"; v.se.ramspr_write_even := e_in.ramspr_write_even; v.se.ramspr_write_odd := e_in.ramspr_write_odd; @@ -1114,8 +1123,6 @@ begin v.direct_branch := '1'; v.e.br_last := '1'; v.e.br_taken := '1'; - v.e.br_offset := b_in; - v.e.abs_br := insn_aa(e_in.insn); if e_in.br_pred = '0' then -- should never happen v.e.redirect := '1'; @@ -1129,14 +1136,13 @@ begin bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd); - if v.take_branch = '1' then - v.e.br_offset := b_in; - v.e.abs_br := insn_aa(e_in.insn); - end if; -- Mispredicted branches cause a redirect if v.take_branch /= e_in.br_pred then v.e.redirect := '1'; end if; + if v.take_branch = '0' then + v.redir_to_next := '1'; + end if; v.direct_branch := '1'; v.e.br_last := '1'; v.e.br_taken := v.take_branch; @@ -1150,10 +1156,6 @@ begin bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd); - if v.take_branch = '1' then - v.e.br_offset := ramspr_result; - v.e.abs_br := '1'; - end if; -- Indirect branches are never predicted taken v.e.redirect := v.take_branch; v.e.br_taken := v.take_branch; @@ -1177,8 +1179,6 @@ begin v.new_msr(MSR_DR) := '1'; end if; v.se.write_msr := '1'; - v.e.br_offset := ramspr_result; - v.e.abs_br := '1'; v.e.redirect := '1'; v.se.write_cfar := '1'; if HAS_FPU then @@ -1292,6 +1292,7 @@ begin when OP_ISYNC => v.e.redirect := '1'; + v.redir_to_next := '1'; when OP_ICBI => v.se.icache_inval := '1'; @@ -1406,6 +1407,7 @@ begin v.mul_select := e_in.sub_select(1 downto 0); v.se := side_effect_init; v.ramspr_wraddr := e_in.ramspr_wraddr; + v.lr_from_next := e_in.lr; v.ramspr_odd_data := actions.ramspr_odd_data; end if; @@ -1423,9 +1425,6 @@ begin irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); - -- Next insn adder used in a couple of places - next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); - -- rotator control signals right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; @@ -1507,10 +1506,9 @@ begin x_to_divider.valid <= actions.start_div; v.div_in_progress := actions.start_div; v.br_mispredict := v.e.redirect and actions.direct_branch; + v.advance_nia := actions.advance_nia; + v.redir_to_next := actions.redir_to_next; exception := actions.trap; - if actions.advance_nia = '1' then - v.e.last_nia := next_nia; - end if; -- Go busy while division is happening because the -- divider is not pipelined. Also go busy while a @@ -1681,6 +1679,9 @@ begin variable sign, zero : std_ulogic; variable rcnz_hi, rcnz_lo : std_ulogic; begin + -- Next insn adder used in a couple of places + next_nia <= std_ulogic_vector(unsigned(ex1.e.last_nia) + 4); + v := ex2; if stage2_stall = '0' then v.e := ex1.e; @@ -1688,6 +1689,9 @@ begin v.ext_interrupt := ex1.ext_interrupt; v.taken_branch_event := ex1.taken_branch_event; v.br_mispredict := ex1.br_mispredict; + if ex1.advance_nia = '1' then + v.e.last_nia := next_nia; + end if; end if; if ex1.se.mult_32s = '1' and ex1.oe = '1' then @@ -1748,10 +1752,12 @@ begin else sprres := pmu_to_x.spr_val; end if; - if ex1.res2_sel(1) = '0' then - ex_result := rcresult; - else + if ex1.res2_sel(1) = '1' then ex_result := sprres; + elsif ex1.redir_to_next = '1' then + ex_result := next_nia; + else + ex_result := rcresult; end if; cr_res := ex1.e.write_cr_data; diff --git a/fetch1.vhdl b/fetch1.vhdl index 6803fb65..677fa27f 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -3,12 +3,14 @@ use ieee.std_logic_1164.all; use ieee.numeric_std.all; library work; +use work.utils.all; use work.common.all; entity fetch1 is generic( RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0'); ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0'); + TLB_SIZE : positive := 64; -- L1 ITLB number of entries (direct mapped) HAS_BTC : boolean := true ); port( @@ -21,6 +23,7 @@ entity fetch1 is inval_btc : in std_ulogic; stop_in : in std_ulogic; alt_reset_in : in std_ulogic; + m_in : in MmuToITLBType; -- redirect from writeback unit w_in : in WritebackToFetch1Type; @@ -40,14 +43,32 @@ architecture behaviour of fetch1 is type reg_internal_t is record mode_32bit: std_ulogic; rd_is_niap4: std_ulogic; - predicted_taken: std_ulogic; - predicted_nia: std_ulogic_vector(63 downto 0); + tlbcheck: std_ulogic; + tlbstall: std_ulogic; + next_nia: std_ulogic_vector(63 downto 0); end record; + + -- Mini effective to real translation cache + type erat_t is record + epn0: std_ulogic_vector(63 - MIN_LG_PGSZ downto 0); + epn1: std_ulogic_vector(63 - MIN_LG_PGSZ downto 0); + rpn0: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0); + rpn1: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0); + priv0: std_ulogic; + priv1: std_ulogic; + valid: std_ulogic_vector(1 downto 0); + mru: std_ulogic; -- '1' => entry 1 most recently used + end record; + signal r, r_next : Fetch1ToIcacheType; signal r_int, r_next_int : reg_internal_t; signal advance_nia : std_ulogic; signal log_nia : std_ulogic_vector(42 downto 0); + signal erat : erat_t; + signal erat_hit : std_ulogic; + signal erat_sel : std_ulogic; + constant BTC_ADDR_BITS : integer := 10; constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS; constant BTC_TARGET_BITS : integer := 62; @@ -55,43 +76,75 @@ architecture behaviour of fetch1 is constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS + 2; type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0); + signal btc_rd_addr : unsigned(BTC_ADDR_BITS - 1 downto 0); signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0'); signal btc_rd_valid : std_ulogic := '0'; + -- L1 ITLB. + constant TLB_BITS : natural := log2(TLB_SIZE); + constant TLB_EA_TAG_BITS : natural := 64 - (MIN_LG_PGSZ + TLB_BITS); + constant TLB_PTE_BITS : natural := 64; + + subtype tlb_index_t is integer range 0 to TLB_SIZE - 1; + type tlb_valids_t is array(tlb_index_t) of std_ulogic; + subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); + type tlb_tags_t is array(tlb_index_t) of tlb_tag_t; + subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); + type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t; + + signal itlb_valids : tlb_valids_t; + signal itlb_tags : tlb_tags_t; + signal itlb_ptes : tlb_ptes_t; + + -- Values read from above arrays on a clock edge + signal itlb_valid : std_ulogic; + signal itlb_ttag : tlb_tag_t; + signal itlb_pte : tlb_pte_t; + signal itlb_hit : std_ulogic; + + -- Privilege bit from PTE EAA field + signal eaa_priv : std_ulogic; + + -- Simple hash for direct-mapped TLB index + function hash_ea(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0); + begin + hash := addr(MIN_LG_PGSZ + TLB_BITS - 1 downto MIN_LG_PGSZ) + xor addr(MIN_LG_PGSZ + 2 * TLB_BITS - 1 downto MIN_LG_PGSZ + TLB_BITS) + xor addr(MIN_LG_PGSZ + 3 * TLB_BITS - 1 downto MIN_LG_PGSZ + 2 * TLB_BITS); + return hash; + end; + begin regs : process(clk) begin if rising_edge(clk) then log_nia <= r.nia(63) & r.nia(43 downto 2); - if r /= r_next then + if r /= r_next and advance_nia = '1' then report "fetch1 rst:" & std_ulogic'image(rst) & " IR:" & std_ulogic'image(r_next.virt_mode) & " P:" & std_ulogic'image(r_next.priv_mode) & " E:" & std_ulogic'image(r_next.big_endian) & " 32:" & std_ulogic'image(r_next_int.mode_32bit) & + " I:" & std_ulogic'image(w_in.interrupt) & " R:" & std_ulogic'image(w_in.redirect) & std_ulogic'image(d_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & - " nia:" & to_hstring(r_next.nia); + " nia:" & to_hstring(r_next.nia) & + " req:" & std_ulogic'image(r_next.req) & + " FF:" & std_ulogic'image(r_next.fetch_fail); end if; - if rst = '1' or w_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then - r.virt_mode <= r_next.virt_mode; - r.priv_mode <= r_next.priv_mode; - r.big_endian <= r_next.big_endian; - r_int.mode_32bit <= r_next_int.mode_32bit; - end if; if advance_nia = '1' then - r.predicted <= r_next.predicted; - r.pred_ntaken <= r_next.pred_ntaken; - r.nia <= r_next.nia; - r_int.predicted_taken <= r_next_int.predicted_taken; - r_int.predicted_nia <= r_next_int.predicted_nia; - r_int.rd_is_niap4 <= r_next_int.rd_is_niap4; + r <= r_next; + r_int <= r_next_int; end if; -- always send the up-to-date stop mark and req r.stop_mark <= stop_in; - r.req <= not rst and not stop_in; + r.req <= r_next.req; + r.fetch_fail <= r_next.fetch_fail; + r_int.tlbcheck <= r_next_int.tlbcheck; + r_int.tlbstall <= r_next_int.tlbstall; end if; end process; log_out <= log_nia; @@ -119,15 +172,13 @@ begin variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0); begin if rising_edge(clk) then - raddr := unsigned(r.nia(BTC_ADDR_BITS + 1 downto 2)) + - to_unsigned(2, BTC_ADDR_BITS); if advance_nia = '1' then - if is_X(raddr) then + if is_X(btc_rd_addr) then btc_rd_data <= (others => 'X'); btc_rd_valid <= 'X'; else - btc_rd_data <= btc_memory(to_integer(raddr)); - btc_rd_valid <= btc_valids(to_integer(raddr)); + btc_rd_data <= btc_memory(to_integer(btc_rd_addr)); + btc_rd_valid <= btc_valids(to_integer(btc_rd_addr)); end if; end if; if btc_wr = '1' then @@ -144,70 +195,250 @@ begin end process; end generate; + erat_sync : process(clk) + begin + if rising_edge(clk) then + if rst /= '0' or m_in.tlbie = '1' then + erat.valid <= "00"; + erat.mru <= '0'; + else + if erat_hit = '1' then + erat.mru <= erat_sel; + end if; + if m_in.tlbld = '1' then + erat.epn0 <= m_in.addr(63 downto MIN_LG_PGSZ); + erat.rpn0 <= m_in.pte(REAL_ADDR_BITS-1 downto MIN_LG_PGSZ); + erat.priv0 <= m_in.pte(3); + erat.valid(0) <= '1'; + erat.valid(1) <= '0'; + erat.mru <= '0'; + elsif r_int.tlbcheck = '1' and itlb_hit = '1' then + if erat.mru = '0' then + erat.epn1 <= r.nia(63 downto MIN_LG_PGSZ); + erat.rpn1 <= itlb_pte(REAL_ADDR_BITS-1 downto MIN_LG_PGSZ); + erat.priv1 <= itlb_pte(3); + erat.valid(1) <= '1'; + else + erat.epn0 <= r.nia(63 downto MIN_LG_PGSZ); + erat.rpn0 <= itlb_pte(REAL_ADDR_BITS-1 downto MIN_LG_PGSZ); + erat.priv0 <= itlb_pte(3); + erat.valid(0) <= '1'; + end if; + erat.mru <= not erat.mru; + end if; + end if; + end if; + end process; + + -- Read TLB using the NIA for the next cycle + itlb_read : process(clk) + variable tlb_req_index : std_ulogic_vector(TLB_BITS - 1 downto 0); + begin + if rising_edge(clk) then + if advance_nia = '1' then + tlb_req_index := hash_ea(r_next.nia); + if is_X(tlb_req_index) then + itlb_pte <= (others => 'X'); + itlb_ttag <= (others => 'X'); + itlb_valid <= 'X'; + else + itlb_pte <= itlb_ptes(to_integer(unsigned(tlb_req_index))); + itlb_ttag <= itlb_tags(to_integer(unsigned(tlb_req_index))); + itlb_valid <= itlb_valids(to_integer(unsigned(tlb_req_index))); + end if; + end if; + end if; + end process; + + -- TLB hit detection + itlb_lookup : process(all) + begin + itlb_hit <= '0'; + if itlb_ttag = r.nia(63 downto MIN_LG_PGSZ + TLB_BITS) then + itlb_hit <= itlb_valid; + end if; + end process; + + -- iTLB update + itlb_update: process(clk) + variable wr_index : std_ulogic_vector(TLB_BITS - 1 downto 0); + begin + if rising_edge(clk) then + wr_index := hash_ea(m_in.addr); + if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then + -- clear all valid bits + for i in tlb_index_t loop + itlb_valids(i) <= '0'; + end loop; + elsif m_in.tlbie = '1' then + assert not is_X(wr_index) report "icache index invalid on write" severity FAILURE; + -- clear entry regardless of hit or miss + itlb_valids(to_integer(unsigned(wr_index))) <= '0'; + elsif m_in.tlbld = '1' then + assert not is_X(wr_index) report "icache index invalid on write" severity FAILURE; + itlb_tags(to_integer(unsigned(wr_index))) <= m_in.addr(63 downto MIN_LG_PGSZ + TLB_BITS); + itlb_ptes(to_integer(unsigned(wr_index))) <= m_in.pte; + itlb_valids(to_integer(unsigned(wr_index))) <= '1'; + end if; + --ev.itlb_miss_resolved <= m_in.tlbld and not rst; + end if; + end process; + comb : process(all) variable v : Fetch1ToIcacheType; variable v_int : reg_internal_t; + variable next_nia : std_ulogic_vector(63 downto 0); + variable m32 : std_ulogic; + variable ehit, esel : std_ulogic; + variable eaa_priv : std_ulogic; begin v := r; v_int := r_int; v.predicted := '0'; v.pred_ntaken := '0'; - v_int.predicted_taken := '0'; - v_int.rd_is_niap4 := '0'; + v.req := not stop_in; + v_int.tlbstall := r_int.tlbcheck; + v_int.tlbcheck := '0'; + + if r_int.tlbcheck = '1' and itlb_hit = '0' then + v.fetch_fail := '1'; + end if; - if rst = '1' then + -- Combinatorial computation of the CIA for the next cycle. + -- Needs to be simple so the result can be used for RAM + -- and TLB access in the icache. + -- If we are stalled, this still advances, and the assumption + -- is that it will not be used. + m32 := r_int.mode_32bit; + if w_in.redirect = '1' then + next_nia := w_in.redirect_nia(63 downto 2) & "00"; + m32 := w_in.mode_32bit; + v.virt_mode := w_in.virt_mode; + v.priv_mode := w_in.priv_mode; + v.big_endian := w_in.big_endian; + v_int.mode_32bit := w_in.mode_32bit; + v.fetch_fail := '0'; + elsif d_in.redirect = '1' then + next_nia := d_in.redirect_nia(63 downto 2) & "00"; + v.fetch_fail := '0'; + elsif r_int.tlbstall = '1' then + -- this case is needed so that the correct icache tags are read + next_nia := r.nia; + else + next_nia := r_int.next_nia; + end if; + if m32 = '1' then + next_nia(63 downto 32) := (others => '0'); + end if; + v.nia := next_nia; + + v_int.next_nia := std_ulogic_vector(unsigned(next_nia) + 4); + + -- Use v_int.next_nia as the BTC read address before it gets possibly + -- overridden with the reset or interrupt address or the predicted branch + -- target address, in order to improve timing. If it gets overridden then + -- rd_is_niap4 gets cleared to indicate that the BTC data doesn't apply. + btc_rd_addr <= unsigned(v_int.next_nia(BTC_ADDR_BITS + 1 downto 2)); + v_int.rd_is_niap4 := '1'; + + -- If the last NIA value went down with a stop mark, it didn't get + -- executed, and hence we shouldn't increment NIA. + advance_nia <= rst or w_in.interrupt or w_in.redirect or d_in.redirect or + (not r.stop_mark and not (r.req and stall_in)); + -- reduce metavalue warnings in sim + if is_X(rst) then + advance_nia <= '1'; + end if; + + -- Translate next_nia to real if possible, otherwise we have to stall + -- and look up the TLB. + ehit := '0'; + esel := '0'; + eaa_priv := '1'; + if next_nia(63 downto MIN_LG_PGSZ) = erat.epn1 and erat.valid(1) = '1' then + ehit := '1'; + esel := '1'; + end if; + if next_nia(63 downto MIN_LG_PGSZ) = erat.epn0 and erat.valid(0) = '1' then + ehit := '1'; + end if; + if v.virt_mode = '0' then + v.rpn := v.nia(REAL_ADDR_BITS - 1 downto MIN_LG_PGSZ); + eaa_priv := '1'; + elsif esel = '1' then + v.rpn := erat.rpn1; + eaa_priv := erat.priv1; + else + v.rpn := erat.rpn0; + eaa_priv := erat.priv0; + end if; + if advance_nia = '1' and ehit = '0' and v.virt_mode = '1' and + r_int.tlbcheck = '0' and v.fetch_fail = '0' then + v_int.tlbstall := '1'; + v_int.tlbcheck := '1'; + end if; + if ehit = '1' or v.virt_mode = '0' then + if eaa_priv = '1' and v.priv_mode = '0' then + v.fetch_fail := '1'; + else + v.fetch_fail := '0'; + end if; + end if; + erat_hit <= ehit and advance_nia; + erat_sel <= esel; + + if rst /= '0' then if alt_reset_in = '1' then - v.nia := ALT_RESET_ADDRESS; + v_int.next_nia := ALT_RESET_ADDRESS; else - v.nia := RESET_ADDRESS; + v_int.next_nia := RESET_ADDRESS; end if; + elsif w_in.interrupt = '1' then + v_int.next_nia := 52x"0" & w_in.intr_vec(11 downto 2) & "00"; + end if; + if rst /= '0' or w_in.interrupt = '1' then + v.req := '0'; v.virt_mode := '0'; v.priv_mode := '1'; v.big_endian := '0'; v_int.mode_32bit := '0'; - v_int.predicted_nia := (others => '0'); - elsif w_in.redirect = '1' then - v.nia := w_in.redirect_nia(63 downto 2) & "00"; - if w_in.mode_32bit = '1' then - v.nia(63 downto 32) := (others => '0'); - end if; - v.virt_mode := w_in.virt_mode; - v.priv_mode := w_in.priv_mode; - v.big_endian := w_in.big_endian; - v_int.mode_32bit := w_in.mode_32bit; - elsif d_in.redirect = '1' then - v.nia := d_in.redirect_nia(63 downto 2) & "00"; - if r_int.mode_32bit = '1' then - v.nia(63 downto 32) := (others => '0'); - end if; - elsif r_int.predicted_taken = '1' then - v.nia := r_int.predicted_nia; - elsif r.req = '1' then - v_int.rd_is_niap4 := '1'; - v.nia := std_ulogic_vector(unsigned(r.nia) + 4); - if r_int.mode_32bit = '1' then - v.nia(63 downto 32) := x"00000000"; - end if; - if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and + v_int.rd_is_niap4 := '0'; + v_int.tlbstall := '0'; + v_int.tlbcheck := '0'; + v.fetch_fail := '0'; + end if; + if v.fetch_fail = '1' then + v_int.tlbstall := '1'; + end if; + if v_int.tlbstall = '1' then + v.req := '0'; + end if; + + -- If there is a valid entry in the BTC which corresponds to the next instruction, + -- use that to predict the address of the instruction after that. + -- (w_in.redirect = '0' and d_in.redirect = '0' and r_int.tlbstall = '0') + -- implies v.nia = r_int.next_nia. + -- r_int.rd_is_niap4 implies r_int.next_nia is the address used to read the BTC. + if v.req = '1' and w_in.redirect = '0' and d_in.redirect = '0' and r_int.tlbstall = '0' and + btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and btc_rd_data(BTC_WIDTH - 2) = r.virt_mode and btc_rd_data(BTC_WIDTH - 3 downto BTC_TARGET_BITS) - = v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then - v_int.predicted_taken := btc_rd_data(BTC_WIDTH - 1); - v.predicted := btc_rd_data(BTC_WIDTH - 1); - v.pred_ntaken := not btc_rd_data(BTC_WIDTH - 1); + = r_int.next_nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then + v.predicted := btc_rd_data(BTC_WIDTH - 1); + v.pred_ntaken := not btc_rd_data(BTC_WIDTH - 1); + if btc_rd_data(BTC_WIDTH - 1) = '1' then + v_int.next_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00"; + v_int.rd_is_niap4 := '0'; end if; end if; - v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00"; - - -- If the last NIA value went down with a stop mark, it didn't get - -- executed, and hence we shouldn't increment NIA. - advance_nia <= rst or w_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in); r_next <= v; r_next_int <= v_int; -- Update outputs to the icache i_out <= r; + i_out.next_nia <= next_nia; + i_out.next_rpn <= v.rpn; end process; diff --git a/icache.vhdl b/icache.vhdl index e01eb359..8dfbd868 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -41,10 +41,6 @@ entity icache is NUM_LINES : positive := 32; -- Number of ways NUM_WAYS : positive := 4; - -- L1 ITLB number of entries (direct mapped) - TLB_SIZE : positive := 64; - -- L1 ITLB log_2(page_size) - TLB_LG_PGSZ : positive := 12; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -55,8 +51,6 @@ entity icache is i_in : in Fetch1ToIcacheType; i_out : out IcacheToDecode1Type; - m_in : in MmuToIcacheType; - stall_in : in std_ulogic; stall_out : out std_ulogic; flush_in : in std_ulogic; @@ -139,49 +133,24 @@ architecture rtl of icache is -- The cache data BRAM organized as described above for each way subtype cache_row_t is std_ulogic_vector(ROW_WIDTH-1 downto 0); - -- The cache tags LUTRAM has a row per set. Vivado is a pain and will - -- not handle a clean (commented) definition of the cache tags as a 3d - -- memory. For now, work around it by putting all the tags + -- We define a cache tag RAM per way, accessed synchronously subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); --- type cache_tags_set_t is array(way_t) of cache_tag_t; --- type cache_tags_array_t is array(index_t) of cache_tags_set_t; - constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS; - subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0); - type cache_tags_array_t is array(index_t) of cache_tags_set_t; + type cache_tags_set_t is array(way_t) of cache_tag_t; + type cache_tags_array_t is array(index_t) of cache_tag_t; + + -- Set of cache tags read on the last clock edge + signal cache_tags_set : cache_tags_set_t; + -- Set of cache tags for snooping writes to memory + signal snoop_tags_set : cache_tags_set_t; + -- Flags indicating write-hit-read on the cache tags + signal tag_overwrite : std_ulogic_vector(NUM_WAYS - 1 downto 0); -- The cache valid bits subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); type cache_valids_t is array(index_t) of cache_way_valids_t; type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; - - -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs - signal cache_tags : cache_tags_array_t; signal cache_valids : cache_valids_t; - attribute ram_style : string; - attribute ram_style of cache_tags : signal is "distributed"; - - -- L1 ITLB. - constant TLB_BITS : natural := log2(TLB_SIZE); - constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS); - constant TLB_PTE_BITS : natural := 64; - - subtype tlb_index_t is integer range 0 to TLB_SIZE - 1; - type tlb_valids_t is array(tlb_index_t) of std_ulogic; - subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); - type tlb_tags_t is array(tlb_index_t) of tlb_tag_t; - subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); - type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t; - - signal itlb_valids : tlb_valids_t; - signal itlb_tags : tlb_tags_t; - signal itlb_ptes : tlb_ptes_t; - attribute ram_style of itlb_tags : signal is "distributed"; - attribute ram_style of itlb_ptes : signal is "distributed"; - - -- Privilege bit from PTE EAA field - signal eaa_priv : std_ulogic; - -- Cache reload state machine type state_t is (IDLE, STOP_RELOAD, CLR_TAG, WAIT_ACK); @@ -189,6 +158,7 @@ architecture rtl of icache is -- Cache hit state (Latches for 1 cycle BRAM access) hit_way : way_sig_t; hit_nia : std_ulogic_vector(63 downto 0); + hit_ra : real_addr_t; hit_smark : std_ulogic; hit_valid : std_ulogic; big_endian: std_ulogic; @@ -208,6 +178,9 @@ architecture rtl of icache is end_row_ix : row_in_line_t; rows_valid : row_per_line_valid_t; + stalled_hit : std_ulogic; -- remembers hit while stalled + stalled_way : way_sig_t; + -- TLB miss state fetch_failed : std_ulogic; end record; @@ -226,9 +199,6 @@ architecture rtl of icache is signal req_raddr : real_addr_t; signal real_addr : real_addr_t; - signal ra_valid : std_ulogic; - signal priv_fault : std_ulogic; - signal access_ok : std_ulogic; -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; @@ -240,14 +210,16 @@ architecture rtl of icache is signal plru_victim : way_sig_t; -- Memory write snoop signals - signal snoop_valid : std_ulogic; - signal snoop_index : index_sig_t; - signal snoop_hits : cache_way_valids_t; + signal snoop_valid : std_ulogic; + signal snoop_index : index_sig_t; + signal snoop_tag : cache_tag_t; + signal snoop_index2 : index_sig_t; + signal snoop_hits : cache_way_valids_t; signal log_insn : std_ulogic_vector(35 downto 0); -- Return the cache line index (tag index) for an address - function get_index(addr: std_ulogic_vector) return index_sig_t is + function get_index(addr: real_addr_t) return index_sig_t is begin return unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)); end; @@ -321,29 +293,6 @@ architecture rtl of icache is return endian & addr(addr'left downto SET_SIZE_BITS); end; - -- Read a tag from a tag memory row - function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is - begin - return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS); - end; - - -- Write a tag to tag memory row - procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t; - tag: cache_tag_t) is - begin - tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; - end; - - -- Simple hash for direct-mapped TLB index - function hash_ea(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is - variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0); - begin - hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ) - xor addr(TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto TLB_LG_PGSZ + TLB_BITS) - xor addr(TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto TLB_LG_PGSZ + 2 * TLB_BITS); - return hash; - end; - begin -- byte-swap read data if big endian @@ -415,7 +364,9 @@ begin signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); signal dout : cache_row_t; signal wr_sel : std_ulogic_vector(0 downto 0); + signal ic_tags : cache_tags_array_t; begin + -- Cache data RAMs, one per way way: entity work.cache_ram generic map ( ROW_BITS => ROW_BITS, @@ -443,6 +394,49 @@ begin wr_addr <= std_ulogic_vector(r.store_row); wr_sel(0) <= do_write; end process; + + -- Cache tag RAMs, one per way, are read and written synchronously. + -- They are instantiated like this instead of trying to describe them as + -- a single array in order to avoid problems with writing a single way. + process(clk) + variable replace_way : way_sig_t; + variable snoop_addr : real_addr_t; + variable next_raddr : real_addr_t; + begin + replace_way := to_unsigned(0, WAY_BITS); + if NUM_WAYS > 1 then + -- Get victim way from plru + replace_way := plru_victim; + end if; + if rising_edge(clk) then + -- Read tags using NIA for next cycle + if flush_in = '1' or i_in.req = '0' or (stall_in = '0' and stall_out = '0') then + next_raddr := i_in.next_rpn & i_in.next_nia(MIN_LG_PGSZ - 1 downto 0); + cache_tags_set(i) <= ic_tags(to_integer(get_index(next_raddr))); + -- Check for simultaneous write to the same location + tag_overwrite(i) <= '0'; + if r.state = CLR_TAG and r.store_index = get_index(next_raddr) and + to_unsigned(i, WAY_BITS) = replace_way then + tag_overwrite(i) <= '1'; + end if; + end if; + + -- Second read port for snooping writes to memory + if (wb_snoop_in.cyc and wb_snoop_in.stb and wb_snoop_in.we) = '1' then + snoop_addr := addr_to_real(wb_to_addr(wb_snoop_in.adr)); + snoop_tags_set(i) <= ic_tags(to_integer(get_index(snoop_addr))); + end if; + + -- Write one tag when in CLR_TAG state + if r.state = CLR_TAG and to_unsigned(i, WAY_BITS) = replace_way then + ic_tags(to_integer(r.store_index)) <= r.store_tag; + end if; + + if rst = '1' then + tag_overwrite(i) <= '0'; + end if; + end if; + end process; end generate; -- Generate PLRUs @@ -468,10 +462,10 @@ begin process(all) begin -- Read PLRU bits from array - if is_X(r.hit_nia) then + if is_X(r.hit_ra) then plru_cur <= (others => 'X'); else - plru_cur <= plru_ram(to_integer(get_index(r.hit_nia))); + plru_cur <= plru_ram(to_integer(get_index(r.hit_ra))); end if; -- PLRU interface @@ -484,92 +478,32 @@ begin begin if rising_edge(clk) then if r.hit_valid = '1' then - assert not is_X(r.hit_nia) severity failure; - plru_ram(to_integer(get_index(r.hit_nia))) <= plru_upd; + assert not is_X(r.hit_ra) severity failure; + plru_ram(to_integer(get_index(r.hit_ra))) <= plru_upd; end if; end if; end process; end generate; - -- TLB hit detection and real address generation - itlb_lookup : process(all) - variable pte : tlb_pte_t; - variable ttag : tlb_tag_t; - variable tlb_req_index : std_ulogic_vector(TLB_BITS - 1 downto 0); - begin - tlb_req_index := hash_ea(i_in.nia); - if is_X(tlb_req_index) then - pte := (others => 'X'); - ttag := (others => 'X'); - else - pte := itlb_ptes(to_integer(unsigned(tlb_req_index))); - ttag := itlb_tags(to_integer(unsigned(tlb_req_index))); - end if; - if i_in.virt_mode = '1' then - real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & - i_in.nia(TLB_LG_PGSZ - 1 downto 0); - if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then - if is_X(tlb_req_index) then - ra_valid <= 'X'; - else - ra_valid <= itlb_valids(to_integer(unsigned(tlb_req_index))); - end if; - else - ra_valid <= '0'; - end if; - eaa_priv <= pte(3); - else - real_addr <= addr_to_real(i_in.nia); - ra_valid <= '1'; - eaa_priv <= '1'; - end if; - - -- no IAMR, so no KUEP support for now - priv_fault <= eaa_priv and not i_in.priv_mode; - access_ok <= ra_valid and not priv_fault; - end process; - - -- iTLB update - itlb_update: process(clk) - variable wr_index : std_ulogic_vector(TLB_BITS - 1 downto 0); - begin - if rising_edge(clk) then - wr_index := hash_ea(m_in.addr); - if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then - -- clear all valid bits - for i in tlb_index_t loop - itlb_valids(i) <= '0'; - end loop; - elsif m_in.tlbie = '1' then - assert not is_X(wr_index) report "icache index invalid on write" severity FAILURE; - -- clear entry regardless of hit or miss - itlb_valids(to_integer(unsigned(wr_index))) <= '0'; - elsif m_in.tlbld = '1' then - assert not is_X(wr_index) report "icache index invalid on write" severity FAILURE; - itlb_tags(to_integer(unsigned(wr_index))) <= m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS); - itlb_ptes(to_integer(unsigned(wr_index))) <= m_in.pte; - itlb_valids(to_integer(unsigned(wr_index))) <= '1'; - end if; - ev.itlb_miss_resolved <= m_in.tlbld and not rst; - end if; - end process; - -- Cache hit detection, output to fetch2 and other misc logic icache_comb : process(all) variable is_hit : std_ulogic; variable hit_way : way_sig_t; variable insn : std_ulogic_vector(ICWORDLEN - 1 downto 0); variable icode : insn_code; + variable ra : real_addr_t; begin -- Extract line, row and tag from request - req_index <= get_index(i_in.nia); - req_row <= get_row(i_in.nia); - req_tag <= get_tag(real_addr, i_in.big_endian); + ra := i_in.rpn & i_in.nia(MIN_LG_PGSZ - 1 downto 0); + real_addr <= ra; + req_index <= get_index(ra); + req_row <= get_row(ra); + req_tag <= get_tag(ra, i_in.big_endian); -- Calculate address of beginning of cache row, will be -- used for cache miss processing if needed -- - req_raddr <= real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & + req_raddr <= ra(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & (ROW_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way @@ -580,20 +514,27 @@ begin end if; for i in way_t loop if i_in.req = '1' and - (cache_valids(to_integer(req_index))(i) = '1' or - (r.state = WAIT_ACK and - req_index = r.store_index and - to_unsigned(i, WAY_BITS) = r.store_way and - r.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '1')) then - if read_tag(i, cache_tags(to_integer(req_index))) = req_tag then - hit_way := to_unsigned(i, WAY_BITS); - is_hit := '1'; - end if; + cache_valids(to_integer(req_index))(i) = '1' and + tag_overwrite(i) = '0' and + cache_tags_set(i) = req_tag then + hit_way := to_unsigned(i, WAY_BITS); + is_hit := '1'; end if; end loop; + if r.state = WAIT_ACK and r.store_valid = '1' and + req_index = r.store_index and + req_tag = r.store_tag and + r.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '1' then + is_hit := '1'; + hit_way := r.store_way; + end if; + if r.stalled_hit = '1' then + is_hit := '1'; + hit_way := r.stalled_way; + end if; -- Generate the "hit" and "miss" signals for the synchronous blocks - if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' then + if i_in.req = '1' and flush_in = '0' and rst = '0' then req_is_hit <= is_hit; req_is_miss <= not is_hit; else @@ -610,19 +551,22 @@ begin -- I prefer not to do just yet as it would force fetch2 to know about -- some of the cache geometry information. -- - insn := (others => '0'); icode := INSN_illegal; - if r.hit_valid = '1' then - assert not is_X(r.hit_way) severity failure; + if is_X(r.hit_way) then + insn := (others => 'X'); + else insn := read_insn_word(r.hit_nia, cache_out(to_integer(r.hit_way))); - -- Currently we use only the top bit for indicating illegal - -- instructions because we know that insn_codes fit into 9 bits. - if is_X(insn) then - insn := (others => '0'); - elsif insn(ICWORDLEN - 1) = '0' then - icode := insn_code'val(to_integer(unsigned(insn(ICWORDLEN-1 downto INSN_IMAGE_BITS)))); - end if; - end if; + end if; + assert not (r.hit_valid = '1' and is_X(r.hit_way)) severity failure; + -- Currently we use only the top bit for indicating illegal + -- instructions because we know that insn_codes fit into 9 bits. + if is_X(insn) then + insn := (others => '0'); + elsif insn(ICWORDLEN - 1) = '0' then + icode := insn_code'val(to_integer(unsigned(insn(ICWORDLEN-1 downto INSN_IMAGE_BITS)))); + insn(31 downto 26) := recode_primary_opcode(icode); + end if; + i_out.insn <= insn(31 downto 0); i_out.icode <= icode; log_insn <= insn; @@ -634,8 +578,8 @@ begin i_out.next_predicted <= r.predicted; i_out.next_pred_ntaken <= r.pred_ntaken; - -- Stall fetch1 if we have a miss on cache or TLB or a protection fault - stall_out <= not (is_hit and access_ok); + -- Stall fetch1 if we have a cache miss + stall_out <= i_in.req and not is_hit and not flush_in; -- Wishbone requests output (from the cache miss reload machine) wishbone_out <= r.wb; @@ -647,9 +591,17 @@ begin if rising_edge(clk) then -- keep outputs to fetch2 unchanged on a stall -- except that flush or reset sets valid to 0 - if stall_in = '1' then - if rst = '1' or flush_in = '1' then - r.hit_valid <= '0'; + if rst = '1' or flush_in = '1' then + r.hit_valid <= '0'; + r.stalled_hit <= '0'; + r.stalled_way <= to_unsigned(0, WAY_BITS); + elsif stall_in = '1' then + if r.state = CLR_TAG then + r.stalled_hit <= '0'; + elsif req_is_hit = '1' then + -- if we have a hit while stalled, remember it + r.stalled_hit <= '1'; + r.stalled_way <= req_hit_way; end if; else -- On a hit, latch the request for the next cycle, when the BRAM data @@ -669,14 +621,17 @@ begin " way:" & to_hstring(req_hit_way) & " RA:" & to_hstring(real_addr); end if; + r.stalled_hit <= '0'; end if; if stall_in = '0' then -- Send stop marks and NIA down regardless of validity r.hit_smark <= i_in.stop_mark; r.hit_nia <= i_in.nia; + r.hit_ra <= real_addr; r.big_endian <= i_in.big_endian; r.predicted <= i_in.predicted; r.pred_ntaken <= i_in.pred_ntaken; + r.fetch_failed <= i_in.fetch_fail and not flush_in; end if; if i_out.valid = '1' then assert not is_X(i_out.insn) severity failure; @@ -689,7 +644,6 @@ begin variable tagset : cache_tags_set_t; variable tag : cache_tag_t; variable snoop_addr : real_addr_t; - variable snoop_tag : cache_tag_t; variable snoop_cache_tags : cache_tags_set_t; variable replace_way : way_sig_t; begin @@ -722,15 +676,14 @@ begin snoop_valid <= wb_snoop_in.cyc and wb_snoop_in.stb and wb_snoop_in.we; snoop_addr := addr_to_real(wb_to_addr(wb_snoop_in.adr)); snoop_index <= get_index(snoop_addr); - snoop_tag := get_tag(snoop_addr, '0'); + snoop_tag <= get_tag(snoop_addr, '0'); snoop_hits <= (others => '0'); + + -- On the next cycle, match up tags with the snooped address + -- to see if any ways need to be invalidated if snoop_valid = '1' then - if is_X(snoop_addr) then - report "metavalue in snoop_addr" severity FAILURE; - end if; - snoop_cache_tags := cache_tags(to_integer(get_index(snoop_addr))); for i in way_t loop - tag := read_tag(i, snoop_cache_tags); + tag := snoop_tags_set(i); -- Ignore endian bit in comparison tag(TAG_BITS - 1) := '0'; if tag = snoop_tag then @@ -738,6 +691,7 @@ begin end if; end loop; end if; + snoop_index2 <= snoop_index; -- Process cache invalidations if inval_in = '1' then @@ -746,12 +700,12 @@ begin end loop; r.store_valid <= '0'; else - -- Do invalidations from snooped stores to memory, one - -- cycle after the address appears on wb_snoop_in. + -- Do invalidations from snooped stores to memory, + -- two cycles after the address appears on wb_snoop_in. for i in way_t loop if snoop_hits(i) = '1' then - assert not is_X(snoop_index) severity failure; - cache_valids(to_integer(snoop_index))(i) <= '0'; + assert not is_X(snoop_index2) severity failure; + cache_valids(to_integer(snoop_index2))(i) <= '0'; end if; end loop; end if; @@ -809,15 +763,6 @@ begin assert not is_X(replace_way) severity failure; cache_valids(to_integer(r.store_index))(to_integer(replace_way)) <= '0'; - -- Store new tag in selected way - for i in 0 to NUM_WAYS-1 loop - if to_unsigned(i, WAY_BITS) = replace_way then - tagset := cache_tags(to_integer(r.store_index)); - write_tag(i, tagset, r.store_tag); - cache_tags(to_integer(r.store_index)) <= tagset; - end if; - end loop; - r.state <= WAIT_ACK; end if; @@ -879,13 +824,6 @@ begin end if; end case; end if; - - -- TLB miss and protection fault processing - if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then - r.fetch_failed <= '0'; - elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then - r.fetch_failed <= '1'; - end if; end if; end process; @@ -915,8 +853,8 @@ begin wstate & std_ulogic_vector(resize(lway, 3)) & req_is_hit & req_is_miss & - access_ok & - ra_valid; + '1' & -- was access_ok + '1'; -- was ra_valid end if; end process; log_out <= log_data; diff --git a/icache_tb.vhdl b/icache_tb.vhdl index 83a84b31..05f7bd54 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -15,8 +15,6 @@ architecture behave of icache_tb is signal i_out : Fetch1ToIcacheType; signal i_in : IcacheToDecode1Type; - signal m_out : MmuToIcacheType; - signal wb_bram_in : wishbone_master_out; signal wb_bram_out : wishbone_slave_out; @@ -32,7 +30,6 @@ begin rst => rst, i_in => i_out, i_out => i_in, - m_in => m_out, stall_in => '0', flush_in => '0', inval_in => '0', @@ -77,19 +74,21 @@ begin i_out.priv_mode <= '1'; i_out.virt_mode <= '0'; i_out.big_endian <= '0'; - - m_out.tlbld <= '0'; - m_out.tlbie <= '0'; - m_out.addr <= (others => '0'); - m_out.pte <= (others => '0'); + i_out.fetch_fail <= '0'; + i_out.predicted <= '0'; + i_out.pred_ntaken <= '0'; wait until rising_edge(clk); wait until rising_edge(clk); wait until rising_edge(clk); + + i_out.next_nia <= x"0000000000000004"; + i_out.next_rpn <= (others => '0'); wait until rising_edge(clk); i_out.req <= '1'; i_out.nia <= x"0000000000000004"; + i_out.rpn <= (others => '0'); wait for 30*clk_period; wait until rising_edge(clk); @@ -102,6 +101,7 @@ begin severity failure; i_out.req <= '0'; + i_out.next_nia <= x"0000000000000008"; wait until rising_edge(clk); @@ -116,6 +116,8 @@ begin "=" & to_hstring(i_in.insn) & " expected 00000002" severity failure; + + i_out.next_nia <= x"0000000000000040"; wait until rising_edge(clk); -- another miss @@ -133,6 +135,9 @@ begin severity failure; -- test something that aliases + i_out.next_nia <= x"0000000000000100"; + wait until rising_edge(clk); + i_out.req <= '1'; i_out.nia <= x"0000000000000100"; wait until rising_edge(clk); diff --git a/mmu.vhdl b/mmu.vhdl index 17748228..fb63cfd5 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -20,7 +20,7 @@ entity mmu is d_out : out MmuToDcacheType; d_in : in DcacheToMmuType; - i_out : out MmuToIcacheType + i_out : out MmuToITLBType ); end mmu; diff --git a/predecode.vhdl b/predecode.vhdl index 27f80e1f..d3ca0156 100644 --- a/predecode.vhdl +++ b/predecode.vhdl @@ -38,8 +38,38 @@ architecture behaviour of predecoder is 2#011100_00000# to 2#011100_11111# => INSN_andi_dot, 2#011101_00000# to 2#011101_11111# => INSN_andis_dot, 2#000000_00000# => INSN_attn, - 2#010010_00000# to 2#010010_11111# => INSN_b, - 2#010000_00000# to 2#010000_11111# => INSN_bc, + 2#010010_00000# to 2#010010_00001# => INSN_brel, + 2#010010_00010# to 2#010010_00011# => INSN_babs, + 2#010010_00100# to 2#010010_00101# => INSN_brel, + 2#010010_00110# to 2#010010_00111# => INSN_babs, + 2#010010_01000# to 2#010010_01001# => INSN_brel, + 2#010010_01010# to 2#010010_01011# => INSN_babs, + 2#010010_01100# to 2#010010_01101# => INSN_brel, + 2#010010_01110# to 2#010010_01111# => INSN_babs, + 2#010010_10000# to 2#010010_10001# => INSN_brel, + 2#010010_10010# to 2#010010_10011# => INSN_babs, + 2#010010_10100# to 2#010010_10101# => INSN_brel, + 2#010010_10110# to 2#010010_10111# => INSN_babs, + 2#010010_11000# to 2#010010_11001# => INSN_brel, + 2#010010_11010# to 2#010010_11011# => INSN_babs, + 2#010010_11100# to 2#010010_11101# => INSN_brel, + 2#010010_11110# to 2#010010_11111# => INSN_babs, + 2#010000_00000# to 2#010000_00001# => INSN_bcrel, + 2#010000_00010# to 2#010000_00011# => INSN_bcabs, + 2#010000_00100# to 2#010000_00101# => INSN_bcrel, + 2#010000_00110# to 2#010000_00111# => INSN_bcabs, + 2#010000_01000# to 2#010000_01001# => INSN_bcrel, + 2#010000_01010# to 2#010000_01011# => INSN_bcabs, + 2#010000_01100# to 2#010000_01101# => INSN_bcrel, + 2#010000_01110# to 2#010000_01111# => INSN_bcabs, + 2#010000_10000# to 2#010000_10001# => INSN_bcrel, + 2#010000_10010# to 2#010000_10011# => INSN_bcabs, + 2#010000_10100# to 2#010000_10101# => INSN_bcrel, + 2#010000_10110# to 2#010000_10111# => INSN_bcabs, + 2#010000_11000# to 2#010000_11001# => INSN_bcrel, + 2#010000_11010# to 2#010000_11011# => INSN_bcabs, + 2#010000_11100# to 2#010000_11101# => INSN_bcrel, + 2#010000_11110# to 2#010000_11111# => INSN_bcabs, 2#001011_00000# to 2#001011_11111# => INSN_cmpi, 2#001010_00000# to 2#001010_11111# => INSN_cmpli, 2#100010_00000# to 2#100010_11111# => INSN_lbz, @@ -220,9 +250,9 @@ architecture behaviour of predecoder is 2#0_11111_01001# => INSN_divd, -- divdo 2#0_01111_01011# => INSN_divw, 2#0_11111_01011# => INSN_divw, -- divwo - 2#0_11001_10110# => INSN_nop, -- dss - 2#0_01010_10110# => INSN_nop, -- dst - 2#0_01011_10110# => INSN_nop, -- dstst + 2#0_11001_10110# => INSN_rnop, -- dss + 2#0_01010_10110# => INSN_rnop, -- dst + 2#0_01011_10110# => INSN_rnop, -- dstst 2#0_11010_10110# => INSN_eieio, 2#0_01000_11100# => INSN_eqv, 2#0_11101_11010# => INSN_extsb, @@ -322,14 +352,14 @@ architecture behaviour of predecoder is 2#0_00011_01000# => INSN_neg, 2#0_10011_01000# => INSN_neg, -- nego -- next 8 are reserved no-op instructions - 2#0_10000_10010# => INSN_nop, - 2#0_10001_10010# => INSN_nop, - 2#0_10010_10010# => INSN_nop, - 2#0_10011_10010# => INSN_nop, - 2#0_10100_10010# => INSN_nop, - 2#0_10101_10010# => INSN_nop, - 2#0_10110_10010# => INSN_nop, - 2#0_10111_10010# => INSN_nop, + 2#0_10000_10010# => INSN_rnop, + 2#0_10001_10010# => INSN_rnop, + 2#0_10010_10010# => INSN_rnop, + 2#0_10011_10010# => INSN_rnop, + 2#0_10100_10010# => INSN_rnop, + 2#0_10101_10010# => INSN_rnop, + 2#0_10110_10010# => INSN_rnop, + 2#0_10111_10010# => INSN_rnop, 2#0_00011_11100# => INSN_nor, 2#0_01101_11100# => INSN_or, 2#0_01100_11100# => INSN_orc, diff --git a/writeback.vhdl b/writeback.vhdl index 7fef5c38..6a86fb72 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -160,34 +160,21 @@ begin end if; -- Outputs to fetch1 + f.interrupt := intr; + f.intr_vec := std_ulogic_vector(to_unsigned(vec, 12)); f.redirect := e_in.redirect; + f.redirect_nia := e_in.write_data; f.br_nia := e_in.last_nia; - f.br_last := e_in.br_last; + f.br_last := e_in.br_last and not intr; f.br_taken := e_in.br_taken; - if intr = '1' then - f.redirect := '1'; - f.br_last := '0'; - f.redirect_nia := std_ulogic_vector(to_unsigned(vec, 64)); - f.virt_mode := '0'; - f.priv_mode := '1'; - -- XXX need an interrupt LE bit here, e.g. from LPCR - f.big_endian := '0'; - f.mode_32bit := '0'; - else - if e_in.abs_br = '1' then - f.redirect_nia := e_in.br_offset; - else - f.redirect_nia := std_ulogic_vector(unsigned(e_in.last_nia) + unsigned(e_in.br_offset)); - end if; - -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 - f.virt_mode := e_in.redir_mode(3); - f.priv_mode := e_in.redir_mode(2); - f.big_endian := e_in.redir_mode(1); - f.mode_32bit := e_in.redir_mode(0); - end if; + -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 + f.virt_mode := e_in.redir_mode(3); + f.priv_mode := e_in.redir_mode(2); + f.big_endian := e_in.redir_mode(1); + f.mode_32bit := e_in.redir_mode(0); f_out <= f; - flush_out <= f_out.redirect; + flush_out <= f_out.redirect or intr; -- Register write data bypass to decode2 wb_bypass.tag.tag <= complete_out.tag;