%% FUNCTION Least_TGL
% Joint L21 and L01 constraints Feature Learning with Least Squares Loss.
%
%% OBJECTIVE
% argmin_{W=[W_K;W_D]} { sum_i^t (0.5 * norm (Y{i} - X{i}' * W(:, i))^2)
%            + opts.rho_L2 * \|W\|_F^2 + rho1 * \|W_K\|_{2,1} }
%           s.t., sum_j \|W_D^j\|_0 <= v
%
%% INPUT
% Denote t as the number of tasks, d the number of features, n the number
% of samples.
% X: {n * d} * t - input matrix
% Y: {n * 1} * t - output matrix
% rho1: L2,1-norm group Lasso parameter.
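% s: upper bound on the number of selected features under the L01 norm
% constraint (i.e., v in Equation (6) in the paper).
% f: number of features under L21 regularization (the first f rows of W).
% opts: optional settings structure, passed through init_opts. Fields read
% by this function: rho_L2, init, W0, maxIter, tFlag, tol, pFlag.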
%% OUTPUT
% W: model: d * t
% funcVal: function value vector.
%
%% LICENSE
%   This program is free software: you can redistribute it and/or modify
%   it under the terms of the GNU General Public License as published by
%   the Free Software Foundation, either version 3 of the License, or
%   (at your option) any later version.
%
%   This program is distributed in the hope that it will be useful,
%   but WITHOUT ANY WARRANTY; without even the implied warranty of
%   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
%   GNU General Public License for more details.
%
%   You should have received a copy of the GNU General Public License
%   along with this program.  If not, see <http://www.gnu.org/licenses/>.
%
%   Copyright (C) 2011 - 2012 Jiayu Zhou and Jieping Ye
%
%   You are suggested to first read the Manual.
%   For any problem, please contact with Jiayu Zhou via jiayu.zhou@asu.edu
%
%   Last modified on Mar 3, 2019.
%
%% RELATED PAPERS
%   [1] Liang Zhao, Qian Sun, Jieping Ye, Feng Chen, Chang-Tien Lu, and 
%       Naren Ramakrishnan. "Feature Constrained Multi-Task Learnings for Event 
%       Forecasting in Social Media." TKDE, vol. 29, no. 5, pp. 1059-1072, 
%       May 1 2017.
%   [2] Evgeniou, A. and Pontil, M. Multi-task feature learning, NIPS 2007.
%   [3] Liu, J. and Ye, J. Efficient L1/Lq Norm Regularization, Technical
%       Report, 2010.
%
%% RELATED FUNCTIONS
%  init_opts

%% Code starts here
function [W, funcVal] = Least_L21_c3(X, Y, s, f, rho1, opts)
% Accelerated proximal-gradient solver for the multi-task least-squares
% problem described in the file header.
%
% Inputs
%   X, Y  - cell arrays with one entry per task (see file header).
%   s     - number of rows kept by the top-s projection applied to rows
%           f+1:end of W.
%   f     - number of leading rows of W that receive the L2,1 shrinkage.
%   rho1  - weight of the L2,1 penalty on the first f rows.
%   opts  - (optional) settings; completed by init_opts. Fields read here:
%           rho_L2, init, W0, maxIter, tFlag, tol, pFlag.
%
% Outputs
%   W       - last accepted iterate (dimension x task_num).
%   funcVal - column vector with one objective value per completed iteration.

% X, Y, s, f and rho1 are all used unconditionally below, so all five are
% required; only opts is optional.
if nargin < 5
    error('Least_L21_c3:notEnoughInputs', ...
        'Inputs: X, Y, s, f, rho1 should be specified!');
end
if nargin < 6
    opts = [];
end

% NOTE(review): multi_transpose is a project helper; presumably it
% transposes every cell so that X{i} becomes features x samples - confirm.
X = multi_transpose(X);

% initialize options.
opts = init_opts(opts);

if isfield(opts, 'rho_L2')
    rho_L2 = opts.rho_L2;
else
    rho_L2 = 0;
end

task_num  = length(X);
dimension = size(X{1}, 1);
funcVal = [];

% Default starting point: column i is X{i} * Y{i}.
W0_prep = [];
for t_idx = 1:task_num
    W0_prep = cat(2, W0_prep, X{t_idx} * Y{t_idx});
end

% initialize a starting point
if opts.init == 2
    W0 = zeros(dimension, task_num);
elseif opts.init == 0
    W0 = W0_prep;
else
    if isfield(opts, 'W0')
        W0 = opts.W0;
        % isequal also rejects N-D inputs, for which subtracting the size
        % vectors would raise an unrelated dimension-mismatch error.
        if ~isequal(size(W0), [dimension, task_num])
            error('Least_L21_c3:invalidW0', ...
                'Check the input .W0: expected a %d x %d matrix.', ...
                dimension, task_num);
        end
    else
        W0 = W0_prep;
    end
end

bFlag = 0; % this flag tests whether the gradient step only changes a little

Wz = W0;
Wz_old = W0;

% Momentum sequence for the accelerated scheme.
t = 1;
t_old = 0;

iter = 0;
gamma = 1;      % current step-size parameter (the step is 1/gamma)
gamma_inc = 2;  % factor applied to gamma when a trial step is rejected

while iter < opts.maxIter
    alpha = (t_old - 1) / t;

    % Search point: extrapolation from the last two iterates.
    Ws = (1 + alpha) * Wz - alpha * Wz_old;

    % Smooth part of the objective (funVal_eval) and its gradient
    % (gradVal_eval) at the search point.
    Fs  = funVal_eval(Ws);
    gWs = gradVal_eval(Ws);

    % Backtracking line search on gamma.
    while true
        S = Ws - gWs / gamma;

        % Rows 1:f are shrunk row-wise (L2,1 proximal step); for the
        % remaining rows only the s rows with the largest norm are kept
        % (the projector in Line 6 of Algorithm 1).
        Wzp01 = FGLasso_projection_l21(S(1:f, :), rho1 / gamma);
        Wzp02 = FGLasso_projection(S(f+1:end, :), s);
        Wzp = [Wzp01; Wzp02];
        Fzp = funVal_eval(Wzp);

        delta_Wzp = Wzp - Ws;
        r_sum = norm(delta_Wzp, 'fro')^2;

        % Quadratic model of the smooth part around Ws with curvature
        % gamma. sum(sum(A .* B)) equals trace(A' * B) without forming the
        % matrix product.
        Fzp_gamma = Fs + sum(sum(delta_Wzp .* gWs)) + gamma / 2 * r_sum;

        if (r_sum <= 1e-20)
            bFlag = 1; % this shows that, the gradient step makes little improvement
            break;
        end

        % Accept the step once the smooth objective at the candidate does
        % not exceed the quadratic model; otherwise the step 1/gamma is
        % too long for that model to be an upper bound, so gamma is
        % increased and the candidate is recomputed.
        if (Fzp <= Fzp_gamma)
            break;
        else
            gamma = gamma * gamma_inc;
        end
    end

    Wz_old = Wz;
    Wz = Wzp;

    % Only the first f rows carry the L2,1 penalty (they are the only rows
    % passed to FGLasso_projection_l21), so only they contribute to the
    % recorded objective value.
    funcVal = cat(1, funcVal, Fzp + nonsmooth_eval(Wz(1:f, :), rho1));

    if (bFlag)
        % fprintf('\n The program terminates as the gradient step changes the solution very small.');
        break;
    end

    % test stop condition.
    switch (opts.tFlag)
        case 0
            % absolute change of the objective
            if iter >= 2
                if (abs(funcVal(end) - funcVal(end-1)) <= opts.tol)
                    break;
                end
            end
        case 1
            % change of the objective relative to its previous value
            if iter >= 2
                if (abs(funcVal(end) - funcVal(end-1)) <= ...
                        opts.tol * funcVal(end-1))
                    break;
                end
            end
        case 2
            % objective value itself below the tolerance
            if (funcVal(end) <= opts.tol)
                break;
            end
        case 3
            % iteration budget only
            if iter >= opts.maxIter
                break;
            end
    end

    iter = iter + 1;
    t_old = t;
    t = 0.5 * (1 + (1 + 4 * t^2)^0.5);

end

% Wz always holds the last accepted iterate and, unlike Wzp, is defined
% even when the loop body never runs (opts.maxIter == 0).
W = Wz;

% private functions

    function [Wp] = FGLasso_projection (W, r)
        % Row-sparsity projection: keep the r rows of W that have the
        % largest L2 norm and set every other row to zero.
        %
        % W  - matrix whose rows are the candidate feature groups.
        % r  - maximum number of rows to keep. Values above size(W, 1)
        %      keep every row; values below 1 keep none.
        % Wp - matrix of the same size as W containing the kept rows.

        n_rows = size(W, 1);
        Wp = zeros(size(W));

        % Clamp the budget to the number of available rows so that an
        % over-sized r cannot index past the end of the sort order.
        k = min(max(floor(r), 0), n_rows);

        % L2 norm of every row, computed in one pass.
        row_norms = sqrt(sum(abs(W).^2, 2));
        [~, order] = sort(row_norms, 'descend');

        % Copy the selected rows; all remaining rows stay zero.
        keep = order(1:k);
        Wp(keep, :) = W(keep, :);
    end

    function [Wp] = FGLasso_projection_l21 (W, lambda )
        % Row-wise group shrinkage (L_{2,1} couples the entries of a row).
        % Each row v of W is replaced by the minimizer of
        %   0.5 * \|w - v\|_2^2 + lambda * \|w\|_2,
        % i.e. v scaled by max(\|v\|_2 - lambda, 0) / \|v\|_2, with an
        % all-zero row mapped to zero.
        % The loop runs under parfor when opts.pFlag is set.

        num_rows = size(W, 1);
        Wp = zeros(size(W));

        if opts.pFlag
            parfor row = 1 : num_rows
                row_vec = W(row, :);
                row_len = norm(row_vec, 2);
                if row_len ~= 0
                    shrunk = max(row_len - lambda, 0)/row_len * row_vec;
                else
                    % avoid 0/0 for an all-zero row
                    shrunk = zeros(size(row_vec));
                end
                Wp(row, :) = shrunk';
            end
        else
            for row = 1 : num_rows
                row_vec = W(row, :);
                row_len = norm(row_vec, 2);
                if row_len ~= 0
                    shrunk = max(row_len - lambda, 0)/row_len * row_vec;
                else
                    % avoid 0/0 for an all-zero row
                    shrunk = zeros(size(row_vec));
                end
                Wp(row, :) = shrunk';
            end
        end
    end

% smooth part gradient.
    function [grad_W] = gradVal_eval(W)
        % Gradient of the smooth part of the objective at W.
        % Column i is X{i} * (X{i}' * W(:, i) - Y{i}), the least-squares
        % gradient of task i; the ridge term contributes 2 * rho_L2 * W.
        % The per-task loop runs under parfor when opts.pFlag is set.

        % One column per task, same shape as W.
        grad_W = zeros(size(W));
        if opts.pFlag
            parfor i = 1:task_num
                grad_W(:, i) = X{i} * (X{i}' * W(:, i) - Y{i});
            end
        else
            for i = 1:task_num
                grad_W(:, i) = X{i} * (X{i}' * W(:, i) - Y{i});
            end
        end
        grad_W = grad_W + rho_L2 * 2 * W;
    end

% smooth part function value.
% the following function funVal_eval corresponds to the equation in Page 5
% of the paper: Multi-Task Learning for Spatio-Temporal Event Forecasting
% f(W) = \sum_{i=1}^m 0.5 * ||X_i^T w_i - Y_i||_2^2 + rho_L2 * ||W||_F^2
    function [smooth_val] = funVal_eval (W)
        % Value of the smooth part of the objective at W: the sum over
        % tasks of half the squared residual norm, plus rho_L2 times the
        % squared Frobenius norm of W.
        % The per-task loop runs under parfor when opts.pFlag is set.
        loss_sum = 0;
        if opts.pFlag
            parfor k = 1 : task_num
                resid = Y{k} - X{k}' * W(:, k);
                loss_sum = loss_sum + 0.5 * norm(resid)^2;
            end
        else
            for k = 1 : task_num
                resid = Y{k} - X{k}' * W(:, k);
                loss_sum = loss_sum + 0.5 * norm(resid)^2;
            end
        end
        smooth_val = loss_sum + rho_L2 * norm(W, 'fro')^2;
    end
% non-smooth part function value: rho_1 times the sum of the L2 norms of
% the rows of W (the L2,1 norm of W).
    function [non_smooth_value] = nonsmooth_eval(W, rho_1)
        % Sum over the rows of W of rho_1 times the row's L2 norm.
        % The row loop runs under parfor when opts.pFlag is set.
        num_rows = size(W, 1);
        penalty = 0;
        if opts.pFlag
            parfor row = 1 : num_rows
                penalty = penalty + rho_1 * norm(W(row, :), 2);
            end
        else
            for row = 1 : num_rows
                penalty = penalty + rho_1 * norm(W(row, :), 2);
            end
        end
        non_smooth_value = penalty;
    end
end