--
-- Algorithm taken from Yalu: https://www.mikrocontroller.net/topic/244674#new
--

library ieee;
	use ieee.std_logic_1164.all;
	use ieee.numeric_std.all;
	use ieee.math_real.ceil;

-- Pipelined unsigned integer cube root.
--
-- data_out presents the cube root of data_in, rounded to an integer, a fixed
-- number of clock cycles after data_in was applied (the latency is set by the
-- pipeline depth chosen in the architecture). There is no handshake: a new
-- input is sampled on every rising clock edge.
entity cube_root is
	generic (
		-- Width in bits of both data_in and data_out.
		NUM_BITS : positive := 8
	);
	port (
		-- Clock; all pipeline registers update on the rising edge.
		clk      : in  std_logic := 'X';
		-- Asynchronous, active-high reset; clears the pipeline and data_out to zero.
		rst      : in  std_logic := 'X';

		-- Radicand, interpreted as an unsigned integer.
		-- Left unconnected, the port defaults to all 'X'.
		data_in  : in  std_logic_vector(NUM_BITS-1 downto 0) := (others => 'X');
		-- Result, an unsigned integer of the same width as data_in.
		data_out : out std_logic_vector(NUM_BITS-1 downto 0) := (others => 'X')
	);
end entity;

-- Pipelined digit-by-digit cube root: every pipeline stage decides one bit of
-- the root, most significant bit first.
--
-- The input is first multiplied by 8 (three zero bits appended), so the root
-- that comes out of the last stage is twice the wanted one. Its extra LSB is
-- then used by proc_round to round the final result to the nearest integer.
--
-- Latency from data_in to data_out is NUM_STAGES + 1 rising clock edges
-- (NUM_STAGES stage registers plus the rounding register).
--
-- Values entering stage i, with b = cubedbit = 2**(3*(NUM_STAGES-1-i)) and
-- y = the integer formed by the root bits decided so far:
--   root      = y    * 8*b
--   y2        = y**2 * 8*b
--   remainder = scaled input - (2*y)**3 * b
-- The trial value s = (12*y**2 + 6*y + 1) * b is exactly the amount by which
-- the cube grows when the next root bit is set, i.e. ((2*y+1)**3 - (2*y)**3) * b.
architecture rtl of cube_root is
	-- We need some additional bits to do proper rounding later
	constant BITS_INTERN   : positive := NUM_BITS + 3;
	-- One root bit per stage: ceil(BITS_INTERN / 3), computed with integers
	-- only so that no floating-point evaluation is needed at elaboration.
	constant NUM_STAGES    : positive := (BITS_INTERN + 2) / 3;
	-- One spare bit above 3*NUM_STAGES for the intermediate values.
	constant NUM_DATA_BITS : positive := 3*NUM_STAGES + 1;

	-- State handed from one pipeline stage to the next.
	type pipeline_data_t is record
		y2        : unsigned(NUM_DATA_BITS-1 downto 0); -- scaled square of the partial root
		root      : unsigned(NUM_DATA_BITS-1 downto 0); -- scaled partial root
		remainder : unsigned(NUM_DATA_BITS-1 downto 0); -- input minus cube of partial root
	end record;

	constant pipeline_data_t_rst : pipeline_data_t := (
		y2        => (others => '0'),
		root      => (others => '0'),
		remainder => (others => '0')
	);

	type DATA_ARRAY_T is array (0 to NUM_STAGES-1) of pipeline_data_t;

	-- store_in(i) is the combinational input of stage i,
	-- store_out(i) is the register at its output.
	signal store_in  : DATA_ARRAY_T := (others => pipeline_data_t_rst);
	signal store_out : DATA_ARRAY_T := (others => pipeline_data_t_rst);
begin

	-- Multiply with 8 (adding three zeros at the bottom) to be able to round
	-- later, then zero-extend to the internal width. A single whole-field
	-- assignment is used instead of an "others" aggregate on a slice, which is
	-- only legal from VHDL-2008 on.
	store_in(0).remainder <= resize(unsigned(data_in) & "000", NUM_DATA_BITS);

	-- No root bits are known before the first stage.
	store_in(0).y2   <= (others => '0');
	store_in(0).root <= (others => '0');


	-- Chain the stages: each one consumes the registered output of its predecessor.
	conn_stages : for i in 1 to NUM_STAGES-1 generate
		store_in(i) <= store_out(i-1);
	end generate;

	gen_stages : for i in 0 to NUM_STAGES-1 generate
		-- Weight of the cube of the root bit decided in this stage: 8**(NUM_STAGES-1-i).
		constant cubedbit : unsigned(NUM_DATA_BITS-1 downto 0) := to_unsigned(1, NUM_DATA_BITS) sll ((NUM_STAGES-i-1)*3);
	begin
		proc : process(clk, rst)
			-- Trial subtrahend; one bit wider than the data so the sum cannot wrap.
			-- It is always written before it is read, so it needs no reset.
			variable s : unsigned(NUM_DATA_BITS downto 0) := (others => '0');
		begin
			if rst = '1' then
				store_out(i) <= pipeline_data_t_rst;
			elsif rising_edge(clk) then
				-- s = 1.5*y2 + 0.75*root + cubedbit
				s :=   ( "0" &  store_in(i).y2          )
				     + ( "0" & (store_in(i).y2   srl 1) )
				     + ( "0" & (store_in(i).root srl 1) )
				     + ( "0" & (store_in(i).root srl 2) )
				     + ( "0" & cubedbit                 );
				if store_in(i).remainder >= s then
					-- Root bit is 1. The remainder never exceeds BITS_INTERN bits,
					-- so in this branch s fits into BITS_INTERN bits as well and
					-- slicing it loses nothing.
					store_out(i).remainder <= store_in(i).remainder - s(BITS_INTERN-1 downto 0);
					store_out(i).y2        <= (store_in(i).y2   srl 1) + cubedbit + (store_in(i).root srl 1);
					store_out(i).root      <= (store_in(i).root srl 2) + cubedbit;
				else
					-- Root bit is 0: only rescale for the next stage.
					store_out(i).remainder <= store_in(i).remainder;
					store_out(i).y2        <= store_in(i).y2   srl 1;
					store_out(i).root      <= store_in(i).root srl 2;
				end if;
			end if;
		end process;
	end generate;

	-- Output register: undo the input scaling and round.
	proc_round : process(clk, rst)
		-- Scratch value, always written before it is read; needs no reset.
		variable tmp : unsigned(BITS_INTERN-1 downto 0) := (others => '0');
	begin
		if rst = '1' then
			data_out <= (others => '0');
		elsif rising_edge(clk) then
			-- Input was multiplied by 8, so output must be divided by cuberoot(8)=2.
			-- To get proper rounding, we add 1 first.
			tmp := (store_out(NUM_STAGES-1).root(BITS_INTERN-1 downto 0) + 1) srl 1;
			data_out <= std_logic_vector( tmp(NUM_BITS-1 downto 0) );
		end if;
	end process;

end architecture;