summaryrefslogtreecommitdiffstats
path: root/zpu/hdl/zealot/zpu_medium.vhdl
diff options
context:
space:
mode:
Diffstat (limited to 'zpu/hdl/zealot/zpu_medium.vhdl')
-rw-r--r--zpu/hdl/zealot/zpu_medium.vhdl948
1 files changed, 948 insertions, 0 deletions
diff --git a/zpu/hdl/zealot/zpu_medium.vhdl b/zpu/hdl/zealot/zpu_medium.vhdl
new file mode 100644
index 0000000..47950fe
--- /dev/null
+++ b/zpu/hdl/zealot/zpu_medium.vhdl
@@ -0,0 +1,948 @@
+------------------------------------------------------------------------------
+---- ----
+---- ZPU Medium ----
+---- ----
+---- http://www.opencores.org/ ----
+---- ----
+---- Description: ----
+---- ZPU is a small 32-bit stack CPU. This is the medium-size version. ----
+---- Supports external memories. ----
+---- ----
+---- To Do: ----
+---- - ----
+---- ----
+---- Author: ----
+---- - Øyvind Harboe, oyvind.harboe zylin.com ----
+---- - Salvador E. Tropea, salvador inti.gob.ar ----
+---- ----
+------------------------------------------------------------------------------
+---- ----
+---- Copyright (c) 2008 Øyvind Harboe <oyvind.harboe zylin.com> ----
+---- Copyright (c) 2008 Salvador E. Tropea <salvador inti.gob.ar> ----
+---- Copyright (c) 2008 Instituto Nacional de Tecnología Industrial ----
+---- ----
+---- Distributed under the BSD license ----
+---- ----
+------------------------------------------------------------------------------
+---- ----
+---- Design unit: ZPUMediumCore(Behave) (Entity and architecture) ----
+---- File name: zpu_medium.vhdl ----
+---- Note: None ----
+---- Limitations: None known ----
+---- Errors: None known ----
+---- Library: zpu ----
+---- Dependencies: IEEE.std_logic_1164 ----
+---- IEEE.numeric_std ----
+---- zpu.zpupkg ----
+---- Target FPGA: Spartan 3 (XC3S400-4-FT256) ----
+---- Language: VHDL ----
+---- Wishbone: No ----
+---- Synthesis tools: Xilinx Release 9.2.03i - xst J.39 ----
+---- Simulation tools: GHDL [Sokcho edition] (0.2x) ----
+---- Text editor: SETEdit 0.5.x ----
+---- ----
+------------------------------------------------------------------------------
+--
+-- write_en_o - set to '1' for a single cycle to send off a write request.
+-- data_o is valid only while write_en_o='1'.
+-- read_en_o - set to '1' for a single cycle to send off a read request.
+-- mem_busy_i - It is illegal to send off a read/write request when
+-- mem_busy_i='1'.
+-- Set to '0' when data_i is valid after a read request.
+-- If it goes to '1'(busy), it is on the cycle after read/
+-- write_en_o is '1'.
+-- addr_o - address for read/write request
+-- data_i - read data. Valid only on the cycle after mem_busy_i='0'
+-- after read_en_o='1' for a single cycle.
+-- data_o - data to write
+-- break_o - set to '1' when CPU hits break instruction
+
+library IEEE;
+use IEEE.std_logic_1164.all;
+use IEEE.numeric_std.all;
+
+library zpu;
+use zpu.zpupkg.all;
+
+-- ZPU medium core: entity declaration.
+-- Generics select the word/address widths, the optional pipelining and which
+-- optional instructions are decoded in hardware (the rest are decoded as
+-- "emulate"). Ports are clock/reset/enable, a break flag, a debug record and
+-- a word-oriented memory interface handshaked with mem_busy_i.
+entity ZPUMediumCore is
+   generic(
+      WORD_SIZE  : integer:=32;  -- 16/32 (2**wordPower)
+      ADDR_W     : integer:=16;  -- Total address space width (incl. I/O)
+      MEM_W      : integer:=15;  -- Memory (prog+data+stack) width; initial SP is (2**MEM_W)-8
+      D_CARE_VAL : std_logic:='X'; -- Value used to fill the unused bits
+      MULT_PIPE  : boolean:=false; -- Pipeline multiplication
+      BINOP_PIPE : integer range 0 to 2:=0; -- Pipeline binary operations (-, =, < and <=)
+      ENA_LEVEL0 : boolean:=true;  -- eq, loadb, neqbranch and pushspadd
+      ENA_LEVEL1 : boolean:=true;  -- lessthan, ulessthan, mult, storeb, callpcrel and sub
+      ENA_LEVEL2 : boolean:=false; -- lessthanorequal, ulessthanorequal, call and poppcrel
+      ENA_LSHR   : boolean:=true;  -- lshiftright
+      ENA_IDLE   : boolean:=false; -- Enable the enable_i input (reset enters st_idle and waits for it)
+      FAST_FETCH : boolean:=true); -- Merge the st_fetch with the st_execute states
+   port(
+      clk_i      : in  std_logic; -- CPU Clock
+      reset_i    : in  std_logic; -- Sync Reset
+      enable_i   : in  std_logic; -- Hold the CPU (after reset); only sampled in st_idle
+      break_o    : out std_logic; -- Break instruction executed
+      dbg_o      : out zpu_dbgo_t; -- Debug outputs (i.e. trace log)
+      -- Memory interface
+      mem_busy_i : in  std_logic; -- Memory is busy
+      data_i     : in  unsigned(WORD_SIZE-1 downto 0); -- Data from mem
+      data_o     : out unsigned(WORD_SIZE-1 downto 0); -- Data to mem
+      addr_o     : out unsigned(ADDR_W-1 downto 0); -- Memory address (byte-select bits are always 0)
+      write_en_o : out std_logic;  -- Memory write enable
+      read_en_o  : out std_logic); -- Memory read enable
+end entity ZPUMediumCore;
+
+architecture Behave of ZPUMediumCore is
+   -- Number of low address bits that select a byte inside a word
+   -- (WORD_SIZE/16 gives 2 for 32-bit words and 1 for 16-bit words).
+   constant BYTE_BITS    : integer:=WORD_SIZE/16; -- # of bits in a word that addresses bytes
+   -- Number of opcodes held in one memory word
+   -- (assumes OPCODE_W from zpupkg is 8, the decoder slices by 8 -- TODO confirm)
+   constant WORD_BYTES   : integer:=WORD_SIZE/OPCODE_W;
+   -- NOTE(review): MAX_ADDR_BIT is not referenced in the visible architecture body
+   constant MAX_ADDR_BIT : integer:=ADDR_W-2;
+   -- Stack Pointer initial value: BRAM size-8
+   constant SP_START_1   : unsigned(ADDR_W-1 downto 0):=to_unsigned((2**MEM_W)-8,ADDR_W);
+   -- The same value as a word address (the byte-select bits are dropped)
+   constant SP_START     : unsigned(ADDR_W-1 downto BYTE_BITS):=
+                           SP_START_1(ADDR_W-1 downto BYTE_BITS);
+
+   -- Write the cached [SP+1] value (held in b_r) back to memory.
+   --   we     : write strobe to raise
+   --   addr   : word address to drive; receives inc_sp (SP+1)
+   --   inc_sp : SP+1
+   --   data   : data bus towards memory; receives b
+   --   b      : cached [SP+1] value
+   procedure FlushB(signal we     : out std_logic;
+                    signal addr   : out unsigned(ADDR_W-1 downto BYTE_BITS);
+                    signal inc_sp : in  unsigned(ADDR_W-1 downto BYTE_BITS);
+                    signal data   : out unsigned(WORD_SIZE-1 downto 0);
+                    signal b      : in  unsigned(WORD_SIZE-1 downto 0)) is
+   begin
+      -- The three assignments are independent, all take effect on the same edge
+      data <= b;
+      addr <= inc_sp;
+      we   <= '1';
+   end procedure FlushB;
+
+   -- Stack push done only on the cached registers (no memory access here):
+   -- the stack pointer moves down one word and the old top of stack (a)
+   -- becomes the new [SP+1] (b). The caller supplies the new value for a.
+   procedure Push(signal sp : inout unsigned(ADDR_W-1 downto BYTE_BITS);
+                  signal a  : in    unsigned(WORD_SIZE-1 downto 0);
+                  signal b  : out   unsigned(WORD_SIZE-1 downto 0)) is
+   begin
+      sp <= sp-1;
+      b  <= a; -- Cache update: new [SP+1] is the old [SP]
+   end procedure Push;
+
+   -- Stack pop done only on the cached registers (no memory access here):
+   -- the stack pointer moves up one word and the cached [SP+1] (b) becomes
+   -- the new top of stack (a). Reloading b is left to the caller.
+   procedure Pop(signal sp : inout unsigned(ADDR_W-1 downto BYTE_BITS);
+                 signal a  : out   unsigned(WORD_SIZE-1 downto 0);
+                 signal b  : in    unsigned(WORD_SIZE-1 downto 0)) is
+   begin
+      sp <= sp+1;
+      a  <= b; -- Cache update: new [SP] is the old [SP+1]
+   end procedure Pop;
+
+   -- Zero-extend a program counter value (ADDR_W bits) to a full word
+   -- (WORD_SIZE bits). Returns unsigned(WORD_SIZE-1 downto 0).
+   function ExpandPC(v : unsigned(ADDR_W-1 downto 0)) return unsigned is
+      variable wide : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
+   begin
+      -- Only the low ADDR_W bits are overwritten, the upper ones stay at '0'
+      wide(v'range):=v;
+      return wide;
+   end function ExpandPC;
+
+   -- Program counter (byte address of the current opcode)
+   signal pc_r          : unsigned(ADDR_W-1 downto 0):=(others => '0');
+   -- Stack pointer (word address: the byte-select bits are not stored)
+   signal sp_r          : unsigned(ADDR_W-1 downto BYTE_BITS):=SP_START;
+   -- SP+1 and SP+2 are used very often, these are shortcuts
+   signal inc_sp        : unsigned(ADDR_W-1 downto BYTE_BITS);
+   signal inc_inc_sp    : unsigned(ADDR_W-1 downto BYTE_BITS);
+   -- a_r is a cache for the top of the stack [SP]
+   -- Note: as this is a stack CPU this is a very important register.
+   signal a_r           : unsigned(WORD_SIZE-1 downto 0);
+   -- b_r is a cache for the next value in the stack [SP+1]
+   signal b_r           : unsigned(WORD_SIZE-1 downto 0);
+   -- Pipeline registers for the binary operations (see BINOP_PIPE):
+   -- res1 is loaded by DoBinOp, res2 is res1 delayed one clock.
+   signal bin_op_res1_r : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
+   signal bin_op_res2_r : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
+   -- Multiplier result (res1) and, when MULT_PIPE is true, two extra
+   -- pipeline stages (res2, res3).
+   signal mult_res1_r   : unsigned(WORD_SIZE-1 downto 0);
+   signal mult_res2_r   : unsigned(WORD_SIZE-1 downto 0);
+   signal mult_res3_r   : unsigned(WORD_SIZE-1 downto 0);
+   -- Multiplier operands, only used when MULT_PIPE is true
+   signal mult_a_r      : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
+   signal mult_b_r      : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
+   -- '1' while we are inside a sequence of IM instructions
+   signal idim_r        : std_logic;
+   -- Registered memory strobes and word address (drive write_en_o, read_en_o
+   -- and the upper bits of addr_o)
+   signal write_en_r    : std_logic;
+   signal read_en_r     : std_logic;
+   signal addr_r        : unsigned(ADDR_W-1 downto BYTE_BITS):=(others => '0');
+   -- Instruction word latched in st_decode, decoded in st_decode2
+   signal fetched_w_r   : unsigned(WORD_SIZE-1 downto 0);
+
+   -- States of the CPU state machine. st_execute dispatches the decoded
+   -- instruction; most of the other states are the extra cycles of
+   -- instructions that need more than one clock.
+   type state_t is(st_load2, st_popped, st_load_sp2, st_load_sp3, st_add_sp2,
+                   st_fetch, st_execute, st_decode, st_decode2, st_resync,
+                   st_store_sp2, st_resync2, st_resync3, st_loadb2, st_storeb2,
+                   st_mult2, st_mult3, st_mult5, st_mult4, st_binary_op_res2,
+                   st_binary_op_res, st_idle);
+   signal state : state_t:=st_resync;
+
+   -- Start fetching the instruction word addressed by pc.
+   --   FAST=false: just hand over to the st_fetch state.
+   --   FAST=true : do the work of st_fetch right here: if the memory is
+   --               free, issue the read and go to st_decode; if it is busy
+   --               nothing is assigned, so the caller's state is kept.
+   procedure DoFetch(constant FAST  : boolean;
+                     signal   state : out state_t;
+                     signal   addr  : out unsigned(ADDR_W-1 downto BYTE_BITS);
+                     signal   pc    : in  unsigned(ADDR_W-1 downto 0);
+                     signal   re    : out std_logic;
+                     signal   busy  : in  std_logic) is
+   begin
+      if not FAST then
+         state <= st_fetch;
+      elsif busy='0' then
+         -- Equivalent to st_fetch
+         state <= st_decode;
+         re    <= '1';
+         addr  <= pc(ADDR_W-1 downto BYTE_BITS);
+      end if;
+   end procedure DoFetch;
+
+   -- Finish a two-operand instruction whose result is already computed.
+   --   result : value to leave on the top of the stack
+   --   DEPTH  : number of extra pipeline clocks
+   --            2 => result goes to dest_p, continue in st_binary_op_res
+   --            1 => result goes to dest_p, continue in st_binary_op_res2
+   --            any other value => result goes straight to dest, one element
+   --                 is popped and the read of the new [SP+1] is issued
+   --                 (completed in st_popped)
+   procedure DoBinOp(         result : in    unsigned(WORD_SIZE-1 downto 0);
+                     signal   state  : out   state_t;
+                     signal   sp     : inout unsigned(ADDR_W-1 downto BYTE_BITS);
+                     signal   addr   : out   unsigned(ADDR_W-1 downto BYTE_BITS);
+                     signal   re     : out   std_logic;
+                     signal   dest   : out   unsigned(WORD_SIZE-1 downto 0);
+                     signal   dest_p : out   unsigned(WORD_SIZE-1 downto 0);
+                     constant DEPTH  : natural) is
+   begin
+      case DEPTH is
+           when 2 =>
+                -- 2 clocks: st_binary_op_res+st_binary_op_res2
+                dest_p <= result;
+                state  <= st_binary_op_res;
+           when 1 =>
+                -- 1 clock: st_binary_op_res2
+                dest_p <= result;
+                state  <= st_binary_op_res2;
+           when others =>
+                -- 0 clocks
+                dest  <= result;
+                sp    <= sp+1;
+                addr  <= sp+2;
+                re    <= '1';
+                state <= st_popped;
+      end case;
+   end procedure DoBinOp;
+
+   -- Finish a two-operand comparison: the boolean outcome is converted to a
+   -- word (1 for true, 0 for false) and handed to DoBinOp with the very same
+   -- signals and pipeline depth.
+   procedure DoBinOpBool(         result : in    boolean;
+                         signal   state  : out   state_t;
+                         signal   sp     : inout unsigned(ADDR_W-1 downto BYTE_BITS);
+                         signal   addr   : out   unsigned(ADDR_W-1 downto BYTE_BITS);
+                         signal   re     : out   std_logic;
+                         signal   dest   : out   unsigned(WORD_SIZE-1 downto 0);
+                         signal   dest_p : out   unsigned(WORD_SIZE-1 downto 0);
+                         constant DEPTH  : natural) is
+      constant WORD_FALSE : unsigned(WORD_SIZE-1 downto 0):=to_unsigned(0,WORD_SIZE);
+      constant WORD_TRUE  : unsigned(WORD_SIZE-1 downto 0):=to_unsigned(1,WORD_SIZE);
+   begin
+      if result then
+         DoBinOp(WORD_TRUE,state,sp,addr,re,dest,dest_p,DEPTH);
+      else
+         DoBinOp(WORD_FALSE,state,sp,addr,re,dest,dest_p,DEPTH);
+      end if;
+   end procedure DoBinOpBool;
+
+   -- Decoded instructions. Besides the real ZPU operations there are:
+   --   dec_insn_fetch : not an instruction, it asks for a new instruction word
+   --   dec_pop, dec_pop_down : storesp with offset 0 and 1
+   --   dec_dup, dec_dup_stk_b : loadsp with offset 0 and 1
+   --   dec_shift, dec_add_top : addsp with offset 0 and 1
+   -- The offset 0/1 cases touch the values cached in a_r/b_r, so they are
+   -- handled on the registers instead of going to memory.
+   type insn_t is (dec_add_top, dec_dup, dec_dup_stk_b, dec_pop, dec_add,
+                   dec_or, dec_and, dec_store, dec_add_sp, dec_shift, dec_nop,
+                   dec_im, dec_load_sp, dec_store_sp, dec_emulate, dec_load,
+                   dec_push_sp, dec_pop_pc, dec_pop_pc_rel, dec_not, dec_flip,
+                   dec_pop_sp, dec_neq_branch, dec_eq, dec_loadb, dec_mult,
+                   dec_less_than, dec_less_than_or_equal, dec_lshr,
+                   dec_u_less_than_or_equal, dec_u_less_than, dec_push_sp_add,
+                   dec_call, dec_call_pc_rel, dec_sub, dec_break, dec_storeb,
+                   dec_insn_fetch, dec_pop_down);
+   -- Instruction being executed in st_execute
+   signal insn : insn_t;
+   -- One decoded instruction per opcode of the fetched word. Entry 0 is
+   -- replaced by dec_insn_fetch after decoding, so wrapping to the next word
+   -- triggers a fetch.
+   type insn_array_t is array(0 to WORD_BYTES-1) of insn_t;
+   signal insns : insn_array_t;
+   -- Raw opcodes of the fetched word, indexed like insns
+   type opcode_array_t is array(0 to WORD_BYTES-1) of unsigned(OPCODE_W-1 downto 0);
+   signal opcode_r : opcode_array_t;
+begin
+   -- The memory subsystem will tell us one cycle later whether or
+   -- not it is busy.
+   -- The strobes and the address are registered in the opcode_control process.
+   write_en_o <= write_en_r;
+   read_en_o  <= read_en_r;
+   -- Only whole words are addressed: the byte-select bits are tied to '0'
+   addr_o(ADDR_W-1 downto BYTE_BITS) <= addr_r;
+   addr_o(BYTE_BITS-1 downto 0)      <= (others => '0');
+
+   -- SP+1 and +2
+   inc_sp     <= sp_r+1;
+   inc_inc_sp <= sp_r+2;
+
+   -- Main CPU process: one clocked state machine that fetches a word,
+   -- decodes its opcodes in parallel and executes them one by one, keeping
+   -- [SP] and [SP+1] cached in a_r and b_r.
+   --
+   -- Memory protocol used here: a request (read_en_r/write_en_r) is only
+   -- issued on a cycle where mem_busy_i='0'; read data is taken from data_i
+   -- in a later state, again only when mem_busy_i='0'.
+   opcode_control:
+   process (clk_i)
+      variable topcode    : unsigned(OPCODE_W-1 downto 0);
+      variable ex_opcode  : unsigned(OPCODE_W-1 downto 0);
+      variable sp_offset  : unsigned(4 downto 0);
+      variable tsp_offset : unsigned(4 downto 0);
+      variable next_pc    : unsigned(ADDR_W-1 downto 0);
+      variable tdecoded   : insn_t;
+      variable tinsns     : insn_array_t;
+      variable mult_res   : unsigned(WORD_SIZE*2-1 downto 0);
+      variable ipc_low    : integer range 0 to 3; -- Address inside a word (pc_r)
+      variable inpc_low   : integer range 0 to 3; -- Address inside a word (next_pc)
+      variable h_bit      : integer;
+      variable l_bit      : integer;
+      -- '1' on the first cycle of an lshiftright, used to trace it only once
+      variable not_lshr   : std_logic:='1';
+   begin
+      if rising_edge(clk_i) then
+         break_o <= '0';
+         if reset_i='1' then
+            if ENA_IDLE then
+               state <= st_idle;
+            else
+               state <= st_resync;
+            end if;
+            sp_r         <= SP_START;
+            pc_r         <= (others => '0');
+            idim_r       <= '0';
+            write_en_r   <= '0';
+            read_en_r    <= '0';
+            mult_a_r     <= (others => '0');
+            mult_b_r     <= (others => '0');
+            dbg_o.b_inst <= '0';
+            -- A reset in the middle of a shift must not leave the trace
+            -- helper cleared, or the next lshiftright wouldn't be traced.
+            not_lshr:='1';
+            -- Resetting addr_r here makes XST fail to use BRAMs ?!
+         else -- reset_i='0'
+            if MULT_PIPE then
+               -- We must multiply unconditionally to get pipelined multiplication
+               mult_res:=mult_a_r*mult_b_r;
+               mult_res1_r <= mult_res(WORD_SIZE-1 downto 0);
+               mult_res2_r <= mult_res1_r;
+               mult_res3_r <= mult_res2_r;
+               mult_a_r <= (others => D_CARE_VAL);
+               mult_b_r <= (others => D_CARE_VAL);
+            end if;
+
+            if BINOP_PIPE=2 then
+               bin_op_res2_r <= bin_op_res1_r; -- pipeline a bit.
+            end if;
+
+            -- Defaults: no request. The states below override them.
+            read_en_r  <='0';
+            write_en_r <='0';
+            -- Allow synthesis tools to load bogus values when we don't
+            -- care about the address and output data.
+            addr_r <= (others => D_CARE_VAL);
+            data_o <= (others => D_CARE_VAL);
+
+            if (write_en_r='1') and (read_en_r='1') then
+               report "read/write collision" severity failure;
+            end if;
+
+            -- Position of the current opcode inside the fetched word and
+            -- the SP offset encoded in it (bit 4 is stored inverted).
+            ipc_low:=to_integer(pc_r(BYTE_BITS-1 downto 0));
+            sp_offset(4):=not opcode_r(ipc_low)(4);
+            sp_offset(3 downto 0):=opcode_r(ipc_low)(3 downto 0);
+            next_pc:=pc_r+1;
+
+            -- Prepare trace snapshot
+            dbg_o.opcode <= opcode_r(ipc_low);
+            dbg_o.pc     <= resize(pc_r,32);
+            dbg_o.stk_a  <= resize(a_r,32);
+            dbg_o.stk_b  <= resize(b_r,32);
+            dbg_o.b_inst <= '0';
+            dbg_o.sp     <= (others => '0');
+            dbg_o.sp(ADDR_W-1 downto BYTE_BITS) <= sp_r;
+
+            case state is
+                 when st_idle =>
+                      if enable_i='1' then
+                         state <= st_resync;
+                      end if;
+                 -- Initial state of ZPU, fetch top of stack (A/B) + first instruction
+                 when st_resync =>
+                      if mem_busy_i='0' then
+                         addr_r    <= sp_r;
+                         read_en_r <= '1';
+                         state     <= st_resync2;
+                      end if;
+                 when st_resync2 =>
+                      if mem_busy_i='0' then
+                         a_r       <= data_i;
+                         addr_r    <= inc_sp;
+                         read_en_r <= '1';
+                         state     <= st_resync3;
+                      end if;
+                 when st_resync3 =>
+                      if mem_busy_i='0' then
+                         b_r       <= data_i;
+                         addr_r    <= pc_r(ADDR_W-1 downto BYTE_BITS);
+                         read_en_r <= '1';
+                         state     <= st_decode;
+                      end if;
+                 when st_decode =>
+                      if mem_busy_i='0' then
+                         -- Here we latch the fetched word to give one full clock
+                         -- cycle to the instruction decoder. This could be removed
+                         -- if using BRAMs and the decoder delay isn't important.
+                         fetched_w_r <= data_i;
+                         state       <= st_decode2;
+                      end if;
+                 when st_decode2 =>
+                      -- decode 4 instructions in parallel
+                      for i in 0 to WORD_BYTES-1 loop
+                          topcode:=fetched_w_r((WORD_BYTES-1-i+1)*8-1 downto (WORD_BYTES-1-i)*8);
+
+                          tsp_offset(4):=not topcode(4);
+                          tsp_offset(3 downto 0):=topcode(3 downto 0);
+
+                          opcode_r(i) <= topcode;
+                          if topcode(7 downto 7)=OPCODE_IM then
+                             tdecoded:=dec_im;
+                          elsif topcode(7 downto 5)=OPCODE_STORESP then
+                             if tsp_offset=0 then
+                                -- Special case, we can avoid a write
+                                tdecoded:=dec_pop;
+                             elsif tsp_offset=1 then
+                                -- Special case, collision
+                                tdecoded:=dec_pop_down;
+                             else
+                                tdecoded:=dec_store_sp;
+                             end if;
+                          elsif topcode(7 downto 5)=OPCODE_LOADSP then
+                             if tsp_offset=0 then
+                                tdecoded:=dec_dup;
+                             elsif tsp_offset=1 then
+                                tdecoded:=dec_dup_stk_b;
+                             else
+                                tdecoded:=dec_load_sp;
+                             end if;
+                          elsif topcode(7 downto 5)=OPCODE_EMULATE then
+                             -- Emulated by default, replaced by the hardware
+                             -- version when the matching ENA_* generic is set
+                             tdecoded:=dec_emulate;
+                             if ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_NEQBRANCH then
+                                tdecoded:=dec_neq_branch;
+                             elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_EQ then
+                                tdecoded:=dec_eq;
+                             elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_LOADB then
+                                tdecoded:=dec_loadb;
+                             elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_PUSHSPADD then
+                                tdecoded:=dec_push_sp_add;
+                             elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_LESSTHAN then
+                                tdecoded:=dec_less_than;
+                             elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_ULESSTHAN then
+                                tdecoded:=dec_u_less_than;
+                             elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_MULT then
+                                tdecoded:=dec_mult;
+                             elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_STOREB then
+                                tdecoded:=dec_storeb;
+                             elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_CALLPCREL then
+                                tdecoded:=dec_call_pc_rel;
+                             elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_SUB then
+                                tdecoded:=dec_sub;
+                             elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_LESSTHANOREQUAL then
+                                tdecoded:=dec_less_than_or_equal;
+                             elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_ULESSTHANOREQUAL then
+                                tdecoded:=dec_u_less_than_or_equal;
+                             elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_CALL then
+                                tdecoded:=dec_call;
+                             elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_POPPCREL then
+                                tdecoded:=dec_pop_pc_rel;
+                             elsif ENA_LSHR and topcode(5 downto 0)=OPCODE_LSHIFTRIGHT then
+                                tdecoded:=dec_lshr;
+                             end if;
+                          elsif topcode(7 downto 4)=OPCODE_ADDSP then
+                             if tsp_offset=0 then
+                                tdecoded:=dec_shift;
+                             elsif tsp_offset=1 then
+                                tdecoded:=dec_add_top;
+                             else
+                                tdecoded:=dec_add_sp;
+                             end if;
+                          else -- OPCODE_SHORT
+                             case topcode(3 downto 0) is
+                                  when OPCODE_BREAK =>
+                                       tdecoded:=dec_break;
+                                  when OPCODE_PUSHSP =>
+                                       tdecoded:=dec_push_sp;
+                                  when OPCODE_POPPC =>
+                                       tdecoded:=dec_pop_pc;
+                                  when OPCODE_ADD =>
+                                       tdecoded:=dec_add;
+                                  when OPCODE_OR =>
+                                       tdecoded:=dec_or;
+                                  when OPCODE_AND =>
+                                       tdecoded:=dec_and;
+                                  when OPCODE_LOAD =>
+                                       tdecoded:=dec_load;
+                                  when OPCODE_NOT =>
+                                       tdecoded:=dec_not;
+                                  when OPCODE_FLIP =>
+                                       tdecoded:=dec_flip;
+                                  when OPCODE_STORE =>
+                                       tdecoded:=dec_store;
+                                  when OPCODE_POPSP =>
+                                       tdecoded:=dec_pop_sp;
+                                  when others => -- OPCODE_NOP and others
+                                       tdecoded:=dec_nop;
+                             end case;
+                          end if;
+                          tinsns(i):=tdecoded;
+                      end loop;
+
+                      insn <= tinsns(ipc_low);
+                      -- once we wrap, we need to fetch
+                      tinsns(0):=dec_insn_fetch;
+                      insns <= tinsns;
+                      state <= st_execute;
+
+                 -- Each instruction must:
+                 --
+                 -- 1. increase pc_r if applicable
+                 -- 2. set next state if applicable
+                 -- 3. do its operation
+                 when st_execute =>
+                      -- Some shortcut to make the code readable:
+                      inpc_low:=to_integer(next_pc(BYTE_BITS-1 downto 0));
+                      ex_opcode:=opcode_r(ipc_low);
+                      -- By default the next decoded instruction is loaded.
+                      -- This is overridden below when the current one stalls.
+                      insn <= insns(inpc_low);
+                      -- Defaults used by most instructions
+                      if insn/=dec_insn_fetch and insn/=dec_im then
+                         dbg_o.b_inst <= '1';
+                         idim_r       <= '0';
+                      end if;
+                      case insn is
+                           when dec_insn_fetch =>
+                                -- Not a real instruction, fetch new instructions
+                                DoFetch(FAST_FETCH,state,addr_r,pc_r,read_en_r,mem_busy_i);
+                           when dec_im =>
+                                -- Push(immediate value), IDIM=1
+                                -- if IDIM=0 Push(signed(opcode & 0x7F)) else
+                                -- Push((Pop()<<7)|(opcode&0x7F))
+                                if mem_busy_i='0' then
+                                   dbg_o.b_inst <= '1';
+                                   idim_r       <= '1';
+                                   pc_r         <= pc_r+1;
+                                   if idim_r='1' then
+                                      -- We already started an IM sequence
+                                      -- Shift left 7 bits
+                                      a_r(WORD_SIZE-1 downto 7) <= a_r(WORD_SIZE-8 downto 0);
+                                      -- Put the new value
+                                      a_r(6 downto 0) <= ex_opcode(6 downto 0);
+                                   else
+                                      -- First IM, push the value sign extended
+                                      FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
+                                      a_r <= unsigned(resize(signed(ex_opcode(6 downto 0)),WORD_SIZE));
+                                      Push(sp_r,a_r,b_r);
+                                   end if;
+                                end if;
+                           when dec_store_sp =>
+                                -- [SP+Offset]=Pop()
+                                if mem_busy_i='0' then
+                                   write_en_r <= '1';
+                                   addr_r     <= sp_r+sp_offset;
+                                   data_o     <= a_r;
+                                   Pop(sp_r,a_r,b_r);
+                                   -- We need to fetch B
+                                   state <= st_store_sp2;
+                                end if;
+                           when dec_load_sp =>
+                                -- Push([SP+Offset])
+                                if mem_busy_i='0' then
+                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
+                                   Push(sp_r,a_r,b_r);
+                                   -- We are flushing B cache, so we need more time to
+                                   -- read the value.
+                                   state <= st_load_sp2;
+                                end if;
+                           when dec_emulate =>
+                                -- Push(PC+1), PC=Opcode[4:0]*32
+                                if mem_busy_i='0' then
+                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
+                                   state <= st_fetch;
+                                   a_r   <= ExpandPC(pc_r+1);
+                                   Push(sp_r,a_r,b_r);
+                                   -- The emulate address is:
+                                   --        98 7654 3210
+                                   -- 0000 00aa aaa0 0000
+                                   pc_r <= (others => '0');
+                                   pc_r(9 downto 5) <= ex_opcode(4 downto 0);
+                                end if;
+                           when dec_call_pc_rel =>
+                                -- t=Pop(), Push(PC+1), PC=PC+t
+                                if mem_busy_i='0' and ENA_LEVEL1 then
+                                   state <= st_fetch;
+                                   a_r   <= ExpandPC(pc_r+1);
+                                   pc_r  <= pc_r+a_r(ADDR_W-1 downto 0);
+                                end if;
+                           when dec_call =>
+                                -- t=Pop(), Push(PC+1), PC=t
+                                if mem_busy_i='0' and ENA_LEVEL2 then
+                                   state <= st_fetch;
+                                   a_r   <= ExpandPC(pc_r+1);
+                                   pc_r  <= a_r(ADDR_W-1 downto 0);
+                                end if;
+                           when dec_add_sp =>
+                                -- Push(Pop()+[SP+Offset])
+                                if mem_busy_i='0' then
+                                   -- Read SP+Offset
+                                   state     <= st_add_sp2;
+                                   read_en_r <= '1';
+                                   addr_r    <= sp_r+sp_offset;
+                                   pc_r      <= pc_r+1;
+                                end if;
+                           when dec_push_sp =>
+                                -- Push(SP)
+                                if mem_busy_i='0' then
+                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
+                                   pc_r <= pc_r+1;
+                                   a_r  <= (others => '0');
+                                   a_r(ADDR_W-1 downto BYTE_BITS) <= sp_r;
+                                   Push(sp_r,a_r,b_r);
+                                end if;
+                           when dec_pop_pc =>
+                                -- PC=Pop() (return)
+                                if mem_busy_i='0' then
+                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
+                                   state <= st_resync;
+                                   pc_r  <= a_r(ADDR_W-1 downto 0);
+                                   sp_r  <= inc_sp;
+                                end if;
+                           when dec_pop_pc_rel =>
+                                -- PC=PC+Pop()
+                                if mem_busy_i='0' and ENA_LEVEL2 then
+                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
+                                   state <= st_resync;
+                                   pc_r  <= a_r(ADDR_W-1 downto 0)+pc_r;
+                                   sp_r  <= inc_sp;
+                                end if;
+                           when dec_add =>
+                                -- Push(Pop()+Pop()) [A=A+B, SP++, update B]
+                                if mem_busy_i='0' then
+                                   state     <= st_popped;
+                                   a_r       <= a_r+b_r;
+                                   read_en_r <= '1';
+                                   addr_r    <= inc_inc_sp;
+                                   sp_r      <= inc_sp;
+                                end if;
+                           when dec_sub =>
+                                -- a=Pop(), b=Pop(), Push(b-a)
+                                if mem_busy_i='0' and ENA_LEVEL1 then
+                                   DoBinOp(b_r-a_r,state,sp_r,addr_r,read_en_r,
+                                           a_r,bin_op_res1_r,BINOP_PIPE);
+                                end if;
+                           when dec_pop =>
+                                -- Pop()
+                                if mem_busy_i='0' then
+                                   state     <= st_popped;
+                                   addr_r    <= inc_inc_sp;
+                                   read_en_r <= '1';
+                                   Pop(sp_r,a_r,b_r);
+                                end if;
+                           when dec_pop_down =>
+                                -- t=Pop(), Pop(), Push(t)
+                                if mem_busy_i='0' then
+                                   -- PopDown leaves top of stack unchanged
+                                   state     <= st_popped;
+                                   addr_r    <= inc_inc_sp;
+                                   read_en_r <= '1';
+                                   sp_r      <= inc_sp;
+                                end if;
+                           when dec_or =>
+                                -- Push(Pop() or Pop())
+                                if mem_busy_i='0' then
+                                   state     <= st_popped;
+                                   a_r       <= a_r or b_r;
+                                   read_en_r <= '1';
+                                   addr_r    <= inc_inc_sp;
+                                   sp_r      <= inc_sp;
+                                end if;
+                           when dec_and =>
+                                -- Push(Pop() and Pop())
+                                if mem_busy_i='0' then
+                                   state     <= st_popped;
+                                   a_r       <= a_r and b_r;
+                                   read_en_r <= '1';
+                                   addr_r    <= inc_inc_sp;
+                                   sp_r      <= inc_sp;
+                                end if;
+                           when dec_eq =>
+                                -- a=Pop(), b=Pop(), Push(a=b ? 1 : 0)
+                                if mem_busy_i='0' and ENA_LEVEL0 then
+                                   DoBinOpBool(a_r=b_r,state,sp_r,addr_r,read_en_r,
+                                               a_r,bin_op_res1_r,BINOP_PIPE);
+                                end if;
+                           when dec_u_less_than =>
+                                -- a=Pop(), b=Pop(), Push(a<b ? 1 : 0)
+                                if mem_busy_i='0' and ENA_LEVEL1 then
+                                   DoBinOpBool(a_r<b_r,state,sp_r,addr_r,read_en_r,
+                                               a_r,bin_op_res1_r,BINOP_PIPE);
+                                end if;
+                           when dec_u_less_than_or_equal =>
+                                -- a=Pop(), b=Pop(), Push(a<=b ? 1 : 0)
+                                if mem_busy_i='0' and ENA_LEVEL2 then
+                                   DoBinOpBool(a_r<=b_r,state,sp_r,addr_r,read_en_r,
+                                               a_r,bin_op_res1_r,BINOP_PIPE);
+                                end if;
+                           when dec_less_than =>
+                                -- a=signed(Pop()), b=signed(Pop()), Push(a<b ? 1 : 0)
+                                if mem_busy_i='0' and ENA_LEVEL1 then
+                                   DoBinOpBool(signed(a_r)<signed(b_r),state,sp_r,
+                                               addr_r,read_en_r,a_r,bin_op_res1_r,
+                                               BINOP_PIPE);
+                                end if;
+                           when dec_less_than_or_equal =>
+                                -- a=signed(Pop()), b=signed(Pop()), Push(a<=b ? 1 : 0)
+                                if mem_busy_i='0' and ENA_LEVEL2 then
+                                   DoBinOpBool(signed(a_r)<=signed(b_r),state,sp_r,
+                                               addr_r,read_en_r,a_r,bin_op_res1_r,
+                                               BINOP_PIPE);
+                                end if;
+                           when dec_load =>
+                                -- Push([Pop()])
+                                if mem_busy_i='0' then
+                                   state     <= st_load2;
+                                   addr_r    <= a_r(ADDR_W-1 downto BYTE_BITS);
+                                   read_en_r <= '1';
+                                   pc_r      <= pc_r+1;
+                                end if;
+                           when dec_dup =>
+                                -- t=Pop(), Push(t), Push(t)
+                                if mem_busy_i='0' then
+                                   pc_r <= pc_r+1;
+                                   -- A is dupped, no change
+                                   Push(sp_r,a_r,b_r);
+                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
+                                end if;
+                           when dec_dup_stk_b =>
+                                -- Pop(), t=Pop(), Push(t), Push(t), Push(t)
+                                if mem_busy_i='0' then
+                                   pc_r <= pc_r+1;
+                                   a_r  <= b_r;
+                                   -- B goes to A
+                                   Push(sp_r,a_r,b_r);
+                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
+                                end if;
+                           when dec_store =>
+                                -- a=Pop(), b=Pop(), [a]=b
+                                if mem_busy_i='0' then
+                                   state      <= st_resync;
+                                   pc_r       <= pc_r+1;
+                                   addr_r     <= a_r(ADDR_W-1 downto BYTE_BITS);
+                                   data_o     <= b_r;
+                                   write_en_r <= '1';
+                                   sp_r       <= inc_inc_sp;
+                                end if;
+                           when dec_pop_sp =>
+                                -- SP=Pop()
+                                if mem_busy_i='0' then
+                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
+                                   state <= st_resync;
+                                   pc_r  <= pc_r+1;
+                                   sp_r  <= a_r(ADDR_W-1 downto BYTE_BITS);
+                                end if;
+                           when dec_nop =>
+                                pc_r <= pc_r+1;
+                           when dec_not =>
+                                -- Push(not(Pop()))
+                                pc_r <= pc_r+1;
+                                a_r  <= not a_r;
+                           when dec_flip =>
+                                -- Push(flip(Pop()))
+                                pc_r <= pc_r+1;
+                                for i in 0 to WORD_SIZE-1 loop
+                                    a_r(i) <= a_r(WORD_SIZE-1-i);
+                                end loop;
+                           when dec_add_top =>
+                                -- a=Pop(), b=Pop(), Push(b), Push(a+b)
+                                pc_r <= pc_r+1;
+                                a_r  <= a_r+b_r;
+                           when dec_shift =>
+                                -- Push(Pop()<<1) [equivalent to a=Pop(), Push(a+a)]
+                                pc_r <= pc_r+1;
+                                a_r(WORD_SIZE-1 downto 1) <= a_r(WORD_SIZE-2 downto 0);
+                                a_r(0) <= '0';
+                           when dec_push_sp_add =>
+                                -- Push(Pop()+SP)
+                                if ENA_LEVEL0 then
+                                   pc_r <= pc_r+1;
+                                   a_r  <= (others => '0');
+                                   a_r(ADDR_W-1 downto BYTE_BITS) <=
+                                      a_r(ADDR_W-1-BYTE_BITS downto 0)+sp_r;
+                                end if;
+                           when dec_neq_branch =>
+                                -- a=Pop(), b=Pop(), PC+=b==0 ? 1 : a
+                                -- Branches are almost always taken as they form loops
+                                if ENA_LEVEL0 then
+                                   sp_r <= inc_inc_sp;
+                                   -- Need to fetch stack again.
+                                   state <= st_resync;
+                                   if b_r/=0 then
+                                      pc_r <= a_r(ADDR_W-1 downto 0)+pc_r;
+                                   else
+                                      pc_r <= pc_r+1;
+                                   end if;
+                                end if;
+                           when dec_mult =>
+                                -- Push(Pop()*Pop())
+                                if ENA_LEVEL1 then
+                                   if MULT_PIPE then
+                                      mult_a_r <= a_r;
+                                      mult_b_r <= b_r;
+                                      state    <= st_mult2;
+                                   else
+                                      mult_res:=a_r*b_r;
+                                      mult_res1_r <= mult_res(WORD_SIZE-1 downto 0);
+                                      state       <= st_mult5;
+                                   end if;
+                                end if;
+                           when dec_break =>
+                                -- Assert the break_o signal
+                                --report "Break instruction encountered" severity failure;
+                                break_o <= '1';
+                                pc_r    <= pc_r+1;
+                           when dec_loadb =>
+                                -- Push([Pop()] & 0xFF) (byte address)
+                                if mem_busy_i='0' and ENA_LEVEL0 then
+                                   state     <= st_loadb2;
+                                   addr_r    <= a_r(ADDR_W-1 downto BYTE_BITS);
+                                   read_en_r <= '1';
+                                   pc_r      <= pc_r+1;
+                                end if;
+                           when dec_storeb =>
+                                -- [Pop()]=Pop() & 0xFF (byte address)
+                                -- Read-modify-write: the word is read first
+                                if mem_busy_i='0' and ENA_LEVEL1 then
+                                   state     <= st_storeb2;
+                                   addr_r    <= a_r(ADDR_W-1 downto BYTE_BITS);
+                                   read_en_r <= '1';
+                                   pc_r      <= pc_r+1;
+                                end if;
+                           when dec_lshr =>
+                                -- a=Pop(), b=Pop(), Push(b>>(a&0x3F))
+                                if ENA_LSHR then
+                                   -- This instruction takes more than one cycle.
+                                   -- We must avoid duplications in the trace log.
+                                   dbg_o.b_inst <= not_lshr;
+                                   not_lshr:='0';
+                                   if a_r(5 downto 0)=0 then -- Only 6 bits used
+                                      -- No more shifts
+                                      if mem_busy_i='0' then
+                                         state     <= st_popped;
+                                         a_r       <= b_r;
+                                         read_en_r <= '1';
+                                         addr_r    <= inc_inc_sp;
+                                         sp_r      <= inc_sp;
+                                         not_lshr:='1';
+                                      end if;
+                                   else -- More shifts needed
+                                      b_r <= "0"&b_r(WORD_SIZE-1 downto 1);
+                                      a_r(5 downto 0) <= a_r(5 downto 0)-1;
+                                      insn <= insn;
+                                   end if;
+                                end if;
+                           when others =>
+                                -- Undefined behavior, we shouldn't get here.
+                                -- It only helps synthesis tools.
+                                sp_r <= (others => D_CARE_VAL);
+                                report "Illegal decode instruction?!" severity failure;
+                                --break_o <= '1';
+                      end case;
+                      -- Stall handling. Instructions that wait for the memory
+                      -- do nothing while mem_busy_i='1': pc_r and state are
+                      -- kept, so the decoded instruction must be kept too.
+                      -- Without this the default assignment above replaces it
+                      -- by the next instruction and the stalled one is lost.
+                      -- This assignment is the last one to insn, so it wins.
+                      if mem_busy_i='1' then
+                         case insn is
+                              when dec_nop | dec_not | dec_flip | dec_add_top |
+                                   dec_shift | dec_push_sp_add | dec_neq_branch |
+                                   dec_mult | dec_break =>
+                                   -- These never wait for the memory
+                                   null;
+                              when others =>
+                                   insn <= insn;
+                         end case;
+                      end if;
+                 -- The followup of operations that takes more than one execution clock
+                 when st_store_sp2 =>
+                      if mem_busy_i='0' then
+                         addr_r    <= inc_sp;
+                         read_en_r <= '1';
+                         state     <= st_popped;
+                      end if;
+                 when st_load_sp2 =>
+                      if mem_busy_i='0' then
+                         state <= st_load_sp3;
+                         -- Now we can read SP+Offset (SP already decremented)
+                         read_en_r <= '1';
+                         addr_r    <= sp_r+sp_offset+1;
+                      end if;
+                 when st_load_sp3 =>
+                      if mem_busy_i='0' then
+                         -- Note: We can't increment PC in the decode stage
+                         -- because it will modify sp_offset.
+                         pc_r <= pc_r+1;
+                         -- Finally we have the result in A
+                         state <= st_execute;
+                         a_r   <= data_i;
+                      end if;
+                 when st_add_sp2 =>
+                      if mem_busy_i='0' then
+                         state <= st_execute;
+                         a_r   <= a_r+data_i;
+                      end if;
+                 when st_load2 =>
+                      if mem_busy_i='0' then
+                         a_r   <= data_i;
+                         state <= st_execute;
+                      end if;
+                 when st_loadb2 =>
+                      if mem_busy_i='0' then
+                         a_r <= (others => '0');
+                         -- Select the source bits using the less significant bits (byte address)
+                         h_bit:=(WORD_BYTES-to_integer(a_r(BYTE_BITS-1 downto 0)))*8-1;
+                         l_bit:=h_bit-7;
+                         a_r(7 downto 0) <= data_i(h_bit downto l_bit);
+                         state <= st_execute;
+                      end if;
+                 when st_storeb2 =>
+                      if mem_busy_i='0' then
+                         addr_r <= a_r(ADDR_W-1 downto BYTE_BITS);
+                         -- Write back the word just read with one byte replaced
+                         data_o <= data_i;
+                         -- Select the source bits using the less significant bits (byte address)
+                         h_bit:=(WORD_BYTES-to_integer(a_r(BYTE_BITS-1 downto 0)))*8-1;
+                         l_bit:=h_bit-7;
+                         data_o(h_bit downto l_bit) <= b_r(7 downto 0);
+                         write_en_r <= '1';
+                         sp_r       <= inc_inc_sp;
+                         state      <= st_resync;
+                      end if;
+                 when st_fetch =>
+                      if mem_busy_i='0' then
+                         addr_r    <= pc_r(ADDR_W-1 downto BYTE_BITS);
+                         read_en_r <= '1';
+                         state     <= st_decode;
+                      end if;
+                 -- The following states can be used to leave cycles free for
+                 -- tools that can automagically decompose the multiplication
+                 -- in various stages. Xilinx tools can do it to increase the
+                 -- multipliers performance.
+                 when st_mult2 =>
+                      state <= st_mult3;
+                 when st_mult3 =>
+                      state <= st_mult4;
+                 when st_mult4 =>
+                      state <= st_mult5;
+                 when st_mult5 =>
+                      if mem_busy_i='0' then
+                         if MULT_PIPE then
+                            a_r <= mult_res3_r;
+                         else
+                            a_r <= mult_res1_r;
+                         end if;
+                         read_en_r <= '1';
+                         addr_r    <= inc_inc_sp;
+                         sp_r      <= inc_sp;
+                         state     <= st_popped;
+                      end if;
+                 when st_binary_op_res =>
+                      -- BINOP_PIPE=2
+                      state <= st_binary_op_res2;
+                 when st_binary_op_res2 =>
+                      -- BINOP_PIPE>=1
+                      -- A read is issued here, so wait for the memory like
+                      -- st_mult5 does. The pipelined result is stable while
+                      -- we wait: bin_op_res1_r is only loaded in st_execute.
+                      if mem_busy_i='0' then
+                         read_en_r <= '1';
+                         addr_r    <= inc_inc_sp;
+                         sp_r      <= inc_sp;
+                         state     <= st_popped;
+                         if BINOP_PIPE=2 then
+                            a_r <= bin_op_res2_r;
+                         else -- 1
+                            a_r <= bin_op_res1_r;
+                         end if;
+                      end if;
+                 when st_popped =>
+                      if mem_busy_i='0' then
+                         -- Note: Moving this PC++ to the decoder seems to
+                         -- consume more LUTs.
+                         pc_r  <= pc_r+1;
+                         b_r   <= data_i;
+                         state <= st_execute;
+                      end if;
+                 when others =>
+                      -- Undefined behavior, we shouldn't get here.
+                      -- It only helps synthesis tools.
+                      sp_r <= (others => D_CARE_VAL);
+                      report "Illegal state?!" severity failure;
+                      --break_o <= '1';
+            end case; -- state
+         end if; -- else reset_i='0'
+      end if; -- rising_edge(clk_i)
+   end process opcode_control;
+end architecture Behave; -- Entity: ZPUMediumCore
+
OpenPOWER on IntegriCloud