disp('Begining.... RLS_TD_CartPole_Final');
%--------------learning parameter definition------------
beta = 0.5; %learning rate of actor
gamma = 0.95;  %discount rate
lambda = 0.6;  %td lambda
mu = 1;     %fogetting rate of RLS
eta = 0.2;  %to initialize P0 = eta * I
k1 = 0.4;   k2 = 0.5;   %param for devariance of normal distribution
%-------------CartPole system definition--------------------------
g=9.8;	length=0.5;
m_cart=1.0;		m_pend=0.1;
total_m = m_cart + m_pend;
momt_pend = m_pend*length;
mu_c = 0.0005;	mu_p = 0.000002;    %friction coefficiency
num_state = 4;
tstep = 0.02;   %time step for simulation

initrange = 0.05;
deg2rad = pi / 180;     rad2deg = 180 / pi; %unit conversion
lower_range = [-12*deg2rad -50*deg2rad -2.4 -1];
higher_range = [12*deg2rad 50*deg2rad 2.4 1];
Max_Theta = 12*deg2rad; Max_Pos = 2.4;
%--------------CMAC network custimization---------------
num_c_tiling = 4;	num_c_partition = 7;	num_c_input = 4; 	num_c_memory = 30; %critic
num_a_tiling = 4;	num_a_partition = 7;	num_a_input = 4;	num_a_memory = 80; %actor
%-----------------Initialization of CMAC network---------------------------
c_interval = (higher_range - lower_range) / num_c_partition;    %intervals for a tiling
c_offset = (0:(num_c_tiling-1)) / num_c_tiling;
tempmx1 = [];   tempmx2 = [];   %temporary matrix variable
for ii = 1:num_c_input
    tempmx1 = [tempmx1 c_offset'];
end
for ii = 1:num_c_tiling
    tempmx2 = [tempmx2; c_interval];
end
c_interval = tempmx2;   %intervals for all tilings
c_offset = tempmx1 .* c_interval;  %finish defining the CMAC tiling's offset
c_num_tiles_per_tiling = num_c_partition^(num_c_input); %number of unit of a tiling

a_interval = (higher_range - lower_range) / num_a_partition;
a_offset = (0:(num_a_tiling-1)) / num_a_tiling;
tempmx1 = [];   tempmx2 = [];   %temporary matrix variable
for ii = 1:num_a_input
    tempmx1 = [tempmx1 a_offset'];
end
for ii = 1:num_a_tiling
    tempmx2 = [tempmx2; a_interval];
end
a_interval = tempmx2;
a_offset = tempmx1 .* a_interval;  %finish defining the CMAC tiling's offset
a_num_tiles_per_tiling = num_a_partition^(num_a_input);

%----------------Allocate the matrix of variables-----------------
c_state = zeros(1,num_c_input);
c_active = zeros(num_c_tiling, num_c_input);
a_state = zeros(1,num_a_input);
a_active = zeros(num_a_tiling, num_a_input);

MaxRuns = 1;
for run = 1:MaxRuns
%-----------------Initialization of variables for each run----------------
    c_weight = zeros(num_c_memory,1);
    a_weight = zeros(num_a_memory,1);
    zt = zeros(num_c_memory,1);     %eligibility trace of critic
    P = eta*eye(num_c_memory);      %RLS variable,  variance matrix
	%initialize System state
    s = rand(1,num_state)*initrange;
	theta = s(1);	theta_vel = s(2);	pos = s(3);		vel = s(4);
	
    trials = 1;
    steps = 0;
    %for recording 
        xx = [];
        ff = [];
    MaxTrials = 200;
	while trials <= MaxTrials
		%----------Calculation of actor output---------------
		a_state = [theta theta_vel pos vel] - lower_range;        
        tempmx1 = [];
        for ii = 1:num_a_tiling
            tempmx1 = [tempmx1; a_state];
        end
		a_active = ceil((tempmx1 - a_offset) ./ a_interval);    %active tile in each tiling
        %Hashing of CMAC network
		a_feature = zeros(num_a_memory,1);
		aa = 0;
		for i = 1:num_a_tiling
            aa = (i-1) * a_num_tiles_per_tiling;
			for j = 1:num_a_input
                if a_active(i,j) <= num_a_partition & a_active(i,j) > 0
    				aa = aa + a_active(i,j) - 1 + num_a_partition^(num_a_input - j);
                end
			end
			aa = rem(aa, num_a_memory) + 1;
			a_feature(aa) = 1;
		end
		y_hat = a_feature' * a_weight;
		
		%------------------Calculation of V(s(t))---------------------------------------
		c_state = [theta theta_vel pos vel] - lower_range;
		tempmx1 = [];
        for ii = 1:num_c_tiling
            tempmx1 = [tempmx1; c_state];
        end
		c_active = ceil((tempmx1 - c_offset) ./ c_interval);
		c_feature = zeros(num_c_memory,1);
		aa = 0;
		for i = 1:num_c_tiling
            aa = (i-1) * c_num_tiles_per_tiling;
			for j = 1:num_c_input
                if c_active(i,j) <= num_c_partition & c_active(i,j) > 0
    				aa = aa + c_active(i,j) - 1 + num_c_partition^(num_c_input - j);
                end
			end
			aa = rem(aa, num_c_memory) + 1;
			c_feature(aa) = 1;
        end		
        
        
		V_t = c_feature' * c_weight;
		
		%calculate according to distribution
		sigma = k1 / (1 + exp(k2*V_t));
		y_t = randn(1,1)*sigma + y_hat;		%actual control action
		
        %---------------Simulation of CartPole system, using Euler method-------------------
        force = y_t * 10;
		if force > 10
			force = 10;
		elseif force < -10
		    force = -10;
        end
		hforce = momt_pend * (theta_vel^2) * sin(theta);
		part_num = -force - hforce + mu_c * sign(vel);
		denom_ang_vel = length * (4/3 - m_pend * ((cos(theta))^2) / total_m);
		num_ang_vel = g*sin(theta) + cos(theta) * part_num / total_m - mu_p * theta_vel / momt_pend;		
		dxdt1 = theta_vel;
		dxdt2 = num_ang_vel / denom_ang_vel;
		num_vel= force + hforce - momt_pend * dxdt2 * cos(theta) - mu_c * sign(vel);		
		dxdt3 = vel;
		dxdt4 = num_vel / total_m;        
		theta = theta + tstep * dxdt1;
		theta_vel = theta_vel + tstep * dxdt2;
		pos = pos + tstep * dxdt3;
		vel = vel + tstep * dxdt4;
        
        %-------------Generate the reward---------------------------------------------------
        if abs(theta) > Max_Theta | abs(pos) > Max_Pos
            rt = -1;
            failed = 1;
        else
            rt = 0;
            failed = 0;
        end
        %--------------Calculation of V(s(t+1))--------------------------------------------
        if failed == 0
			c_state = [theta theta_vel pos vel] - lower_range;			
            tempmx1 = [];
            for ii = 1:num_c_tiling
                tempmx1 = [tempmx1; c_state];
            end
			c_active = ceil((tempmx1 - c_offset) ./ c_interval);
			c_feature_new = zeros(num_c_memory,1);
			aa = 0;
			for i = 1:num_c_tiling
	            aa = (i-1) * c_num_tiles_per_tiling;
				for j = 1:num_c_input
	                if c_active(i,j) <= num_c_partition & c_active(i,j) > 0
	    				aa = aa + c_active(i,j) - 1 + num_c_partition^(num_c_input - j);
	                end
				end
				aa = rem(aa, num_c_memory) + 1;
				c_feature_new(aa) = 1;
	        end		
			V_t_new = c_feature_new' * c_weight;
		else    %if fails, V(s) = 0
			V_t_new = 0;
			c_feature_new = zeros(num_c_memory,1);
		end
        
        %----------Eligibility trace update -------------------------
        zt = gamma*lambda*zt + c_feature;
        %----------TD error------------------------------------------
        r_hat = rt + gamma * V_t_new - V_t;
        %----------Critic update, using RLS_TD(lambda) method--------
        FeatureVector = c_feature' - gamma*c_feature_new';
        tt2 = FeatureVector * P * zt;
        K = P * zt / (mu + tt2);
        c_weight = c_weight + K * r_hat;
        P = P -K*FeatureVector*P;
        P = P / mu;        
        % ---------Critic update, using Kernel LS-TD
        
        
        
        %----------Actor update, using policy gradient
        policygradient = r_hat * (y_t - y_hat) / sigma;
        a_weight = a_weight + beta * policygradient * a_feature / num_a_tiling;
        %----------Next state---------------
        steps = steps + 1;
        ss = [theta theta_vel pos vel];
        xx = [xx; ss];
        ff = [ff; force];
        
        %if failed
        if failed == 1
            if trials == MaxTrials
                figure
                subplot(5,1,1);
                plot(xx(:,1)*rad2deg);
                subplot(5,1,2);
                plot(xx(:,2)*rad2deg);
                subplot(5,1,3);
                plot(xx(:,3));
                subplot(5,1,4);
                plot(xx(:,4));
                subplot(5,1,5);
                plot(ff);
            end
            %------Re-initilization of CartPole state
            s = rand(1,num_state)*initrange;
        	theta = s(1);	theta_vel = s(2);
        	pos = s(3);		vel = s(4);            
            zt = zt*0;  %clear eligibility trace
            xx = [s];
            ff = [];
            if MaxRuns > 1
                if rem(trials,20) == 0 
                    disp(['Trial #' num2str(trials) ' has ' num2str(steps) ' steps...']);
                end
            else
                disp(['Trial #' num2str(trials) ' has ' num2str(steps) ' steps...']);
            end
            steps = 0;
            trials = trials + 1;
        elseif steps >= 12000
            disp(['Success: Trial #' num2str(trials) ' has ' num2str(steps) ' steps...']);
            figure
            subplot(5,1,1);
            plot(xx(:,1)*rad2deg);
            ylabel('theta-degrees');
            subplot(5,1,2);
            plot(xx(:,2)*rad2deg);
            ylabel('theta-dot');
            subplot(5,1,3);
            plot(xx(:,3));
            ylabel('position');
            subplot(5,1,4);
            plot(xx(:,4));
            ylabel('position-dot');
            subplot(5,1,5);
            plot(ff);
            ylabel('force');
            break;      %end training
        elseif rem(steps,10000) == 0
            disp(['It is ' num2str(steps) ' step now...']);
        end
    end %end of "while "
end