// (C)2014-15 Pratik Jawanpuria, Maksim Lapin, Matthias Hein and Bernt Schiele
// Machine Learning Group, Saarland University, Germany
// http://www.ml.uni-saarland.de
// Computer Vision and Multimodal Computing, Max-Planck-Institut für Informatik, Germany
// https://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal-computing/

#include "mex.h"
#include <iostream>
#include <iomanip>
#include <stdio.h>
#include <algorithm>
#include <limits>
#include <math.h>
#include <assert.h>
#include "sdca/eokl_mol_sdca_squared_pnorm.h"
#include "blas.h"
#include "sdca/mt19937ar.h"

/**
 * Bundles the solver settings and the final run statistics into a scalar
 * MATLAB struct.
 *
 * The struct is described by a flat, null-terminated list of
 * (field name, field value) pairs that is handed to createScalarStructArray.
 * Every numeric quantity, including the integer counters and the status code,
 * is converted to double before it is stored.
 *
 * @return the struct array produced by createScalarStructArray.
 */
mxArray * createInfoStruct(SolverStatus status_,double cpu_time_,double wall_time_,int n,int T,double eta, double weight_pos,double weight_neg,double primal_obj,double dual_obj,
	double abs_duality_gap,double rel_duality_gap,double primal_loss_,double dual_loss_,double regularizer_,double epsilon,int epoch_counter,int MaxNumEpoch,int CheckGapFrequency,double seed){
  // Single conversion point: whatever is passed in reaches mxCreateScalar as a double.
  auto asScalar = [](double value) { return mxCreateScalar(value); };

  void const *nameValuePairs [] = {
    "Solver", mxCreateString("FMTL-SDCA-2norm-Hinge"),
    "Status", asScalar(static_cast<double>(status_)),
    "StatusName", mxCreateString(status_to_string(status_)),
    "CpuTime", asScalar(cpu_time_),
    "WallTime", asScalar(wall_time_),
    "NumExamples", asScalar(n),
    "NumTasks", asScalar(T),
    "Eta", asScalar(eta),
    "Weight_Pos", asScalar(weight_pos),
    "Weight_Neg", asScalar(weight_neg),
    "Primal", asScalar(primal_obj),
    "Dual", asScalar(dual_obj),
    "AbsoluteGap", asScalar(abs_duality_gap),
    "RelativeGap", asScalar(rel_duality_gap),
    "PrimalLoss", asScalar(primal_loss_),
    "DualLoss", asScalar(dual_loss_),
    "Regularizer", asScalar(regularizer_),
    "Epsilon", asScalar(epsilon),
    "Epoch", asScalar(epoch_counter),
    "MaxNumEpoch", asScalar(MaxNumEpoch),
    "CheckGapFrequency", asScalar(CheckGapFrequency),
    "Seed", asScalar(seed),
    0, 0  // terminator pair
  };
  return createScalarStructArray(nameValuePairs);
}

/**
 * Sets the first row*col entries of the contiguous buffer B to zero.
 *
 * The element count is computed in size_t, so buffers with more than INT_MAX
 * entries do not trigger signed integer overflow in the index arithmetic.
 *
 * @param B    destination buffer holding at least row*col doubles
 * @param row  number of rows; nothing is written when it is not positive
 * @param col  number of columns; nothing is written when it is not positive
 */
void initialize_vectormatrix_zero(double *B, int row, int col){
    if (row <= 0 || col <= 0)
        return;
    const size_t count = static_cast<size_t>(row) * static_cast<size_t>(col);
    std::fill(B, B + count, 0.0);
}

/**
 * Expands a vector of 1-based class labels into a T-by-n one-vs-all label
 * matrix stored column by column (entry (t, j) lives at Y_matrix[t + T*j]).
 *
 * Entry (t, j) receives PosLabel when Y[j] equals t+1 and NegLabel otherwise,
 * so a label outside 1..T yields a column made only of NegLabel.
 *
 * @param Y_matrix  output buffer with room for T*n doubles
 * @param Y         n class labels
 * @param T         number of tasks (rows)
 * @param n         number of examples (columns)
 * @param NegLabel  value written where the example does not belong to the task
 * @param PosLabel  value written where the example belongs to the task
 */
void create_Y_matrix(double *Y_matrix,double *Y,int T, int n, double NegLabel, double PosLabel){
    for (int sample = 0; sample < n; ++sample)
        for (int taskIdx = 0; taskIdx < T; ++taskIdx){
            const bool belongsToTask = (Y[sample] == static_cast<double>(taskIdx + 1));
            Y_matrix[T*sample + taskIdx] = belongsToTask ? PosLabel : NegLabel;
        }
}

/**
 * Expands a vector of 1-based class labels into a T-by-n matrix of +1/-1
 * one-vs-all targets stored column by column (entry (t, j) lives at
 * Y_matrix[t + T*j]).
 *
 * Entry (t, j) is +1.0 when Y[j] equals t+1 and -1.0 otherwise.
 *
 * @param Y_matrix  output buffer with room for T*n doubles
 * @param Y         n class labels
 * @param T         number of tasks (rows)
 * @param n         number of examples (columns)
 */
void create_Y_matrix(double *Y_matrix,double *Y,int T, int n){
    for (int col = 0; col < n; ++col){
        for (int row = 0; row < T; ++row){
            // Start from the negative target and flip it for the matching task.
            double target = -1.0;
            if (Y[col] == static_cast<double>(row + 1))
                target = 1.0;
            Y_matrix[row + T*col] = target;
        }
    }
}

/**
 * Coordinate-wise dual solver for T one-vs-all hinge-loss tasks that share a
 * squared-norm regularizer on the T-by-T matrix Theta = A*K*A'.
 *
 * Each epoch visits the n examples in a random order and, for every example,
 * all T tasks (the task order is reversed after every example).  For each
 * (task, example) pair the step `delta` returned by MinQuartic is added to the
 * dual variable and the cached quantities are updated.  The primal/dual
 * objectives and the duality gap are recomputed from scratch every
 * CheckGapFrequency epochs and once more before the solver stops.
 *
 * Changes with respect to the previous revision:
 *  - the offset into K_ and the T*T element count are computed in ptrdiff_t,
 *    so they no longer overflow int for large n or T;
 *  - the unused scratch buffer KAt_ is no longer allocated;
 *  - writes into the four history arrays are bounded by max_length_vector.
 *
 * @param Y                  n class labels; label t+1 marks task t as positive
 * @param K_                 n-by-n matrix stored column by column; column i is
 *                           read at K_ + n*i (presumably a symmetric kernel
 *                           matrix -- confirm with the caller)
 * @param n                  number of examples
 * @param T                  number of tasks
 * @param eta                regularization parameter; enters as 2*eta and 1/(2*eta)
 * @param weight_pos         upper bound of the dual variables of positive
 *                           pairs and weight of their hinge loss
 * @param weight_neg         same for negative pairs (dual range [-weight_neg, 0])
 * @param MaxNumEpoch        epoch budget
 * @param epsilon_           stop when gap < epsilon_ * max(|primal|, |dual|)
 * @param CheckGapFrequency  gap is checked every this many epochs (only at the
 *                           end when it is not positive)
 * @param seed               seed handed to sdca::init_genrand
 * @param A_                 output, T-by-n dual variables (zeroed on entry)
 * @param Theta_             output, T-by-T matrix (zeroed on entry)
 * @param pInfo              output, receives the struct built by createInfoStruct
 * @param primalObjArray     output history of primal objectives
 * @param dualObjArray       output history of dual objectives
 * @param cpu_time_Array     output history of elapsed CPU time
 * @param wall_time_Array    output history of elapsed wall time
 * @param max_length_vector  capacity of each of the four history arrays
 */
void sdca_squared_2norm(double *Y,double *K_,int n,int T,double eta,double weight_pos,double weight_neg,int MaxNumEpoch,double epsilon_,int CheckGapFrequency,double seed,double *A_,double *Theta_, 
			mxArray **pInfo, double *primalObjArray,double *dualObjArray,double *cpu_time_Array,double *wall_time_Array,int max_length_vector){
    // loop variables
    bool flag = true;
    int epoch_counter = 0, instance_index;
    int ione = 1;
    double zero = 0.0;
    double bone = 1.0;
    double PosLabel = 1.0;
    ptrdiff_t one = 1;
    ptrdiff_t ptrdiff_t_n = static_cast<ptrdiff_t>(n);
    ptrdiff_t ptrdiff_t_T = static_cast<ptrdiff_t>(T);
    // Multiply after widening so that T*T cannot overflow int.
    ptrdiff_t ptrdiff_t_TT = ptrdiff_t_T * ptrdiff_t_T;
    bool force_check_gap_, variables_changed_, recompute_gap_=false, check_now, endTraining = false;
    double diff, max_obj;
    SolverStatus status_ = SolverStatus::kNone;
    int obj_compute_counter = 0;

    // data variables
    // NOTE: n*T is still computed in int and is assumed to fit.
    int num_elements_ = n*T;
    double *Y_matrix = new double[num_elements_];
    double label;
    ptrdiff_t ptrdiff_t_nT = static_cast<ptrdiff_t>(num_elements_);

    // cache variables
    double *Theta_t_;
    double *AKi_ = new double[T];
    double *AtK_ = new double[num_elements_];
    double *Theta_AtK_ = new double[num_elements_];
    double two_eta_ = 2.0*eta;
    double AKi_AKi_;
    int instance_, task;
    double *Ki_;
    double AKi_task;

    // optimization variables
    unsigned int *one_to_n_array= new unsigned int[n];
    unsigned int *one_to_T_array= new unsigned int[T];
    double dual_obj, primal_obj, rel_duality_gap, abs_duality_gap, dual_objective_old_;
    double Kii_, a, b, c, d, delta, delta2, AKi_t_delta;
    double primal_loss_, dual_loss_, coeff, alpha_old,AKi_Theta_t;
    double sum_alpha_i_yi, regularizer_;
    //double max_delta = -std::numeric_limits<double>::infinity();
    //double min_delta = std::numeric_limits<double>::infinity();
    double lb, ub, temp;
    double primal_y,primal_t,primal_c;

    coeff = 1.0 / two_eta_;

    //constants
    char kTranspose = 'T';
    char kNoTranspose = 'N';

    // time variables
    double walltime0=0.0,cputime0=0.0;
    double cpu_time_=0.0, wall_time_=0.0;
    #ifdef PIECEWISE_TIME
    double walltime_random0=0.0,cputime_random0=0.0,walltime_random1=0.0,cputime_random1=0.0;
    double walltime_root0=0.0,cputime_root0=0.0,walltime_root1=0.0,cputime_root1=0.0;
    double walltime_aTQa0=0.0,cputime_aTQa0=0.0,walltime_aTQa1=0.0,cputime_aTQa1=0.0;
    double walltime_Qalpha0=0.0,cputime_Qalpha0=0.0,walltime_Qalpha1=0.0,cputime_Qalpha1=0.0;
    #endif
    #ifdef GAP_TIME
    double walltime_checkGap0=0.0,cputime_checkGap0=0.0,walltime_checkGap1=0.0,cputime_checkGap1=0.0;
    #endif

    #ifdef VERBOSE
      std::cout << "MTLOVAHINGESDCA::Start(" <<
        std::scientific << std::setprecision(16) <<
        "num_examples: " << n << ", "
        "num_tasks: " << T << ", "
        "eta: " << eta << ", "
        "pos_weight: " << weight_pos << ", "
        "neg_weight: " << weight_neg << ", "
        "epsilon: " << epsilon_ << ", "
        "max_num_epoch: " << MaxNumEpoch << ", "
        "check_gap_frequency: " << CheckGapFrequency << ", "
        "seed: " << seed << ")" << std::endl;
    #endif

    primal_obj = std::numeric_limits<double>::infinity();
    dual_obj = -std::numeric_limits<double>::infinity();
    abs_duality_gap = std::numeric_limits<double>::infinity();
    rel_duality_gap = std::numeric_limits<double>::infinity();
    dual_objective_old_ = -std::numeric_limits<double>::infinity();
    primal_loss_ = + std::numeric_limits<double>::infinity();
    dual_loss_ = - std::numeric_limits<double>::infinity();
    regularizer_ = 0.0;
    status_ = SolverStatus::kTraining;

    // +1/-1 one-vs-all targets, T-by-n, and cleared history buffers.
    create_Y_matrix(Y_matrix,Y,T,n);
    initialize_vectormatrix_zero(primalObjArray,max_length_vector,ione);
    initialize_vectormatrix_zero(dualObjArray,max_length_vector,ione);
    initialize_vectormatrix_zero(cpu_time_Array,max_length_vector,ione);
    initialize_vectormatrix_zero(wall_time_Array,max_length_vector,ione);
    sdca::init_genrand(static_cast<unsigned long>(seed));

    walltime0 = get_wall_time();
    cputime0 = get_cpu_time();

    for (int i=0; i<n;i++)
        one_to_n_array[i]=i;
    for (int t=0; t<T;t++)
        one_to_T_array[t]=t;
    initialize_vectormatrix_zero(A_,T,n);
    initialize_vectormatrix_zero(Theta_,T,T);
    while (flag){
        epoch_counter++;
        force_check_gap_ = false;
        variables_changed_ = false;
        #ifdef PIECEWISE_TIME
        walltime_random0 = get_wall_time();
        cputime_random0 = get_cpu_time();
        #endif

        // New random visiting order of the examples for this epoch.
        sdca::rand_permute(one_to_n_array, n);

        #ifdef PIECEWISE_TIME
        walltime_random1 += get_wall_time() - walltime_random0;
        cputime_random1 += get_cpu_time() - cputime_random0;
        #endif
        for (int i=0;i<n;i++){
            instance_ =  one_to_n_array[i];
            // Column `instance_` of K; the offset is widened to avoid int overflow.
            Ki_ = K_ + static_cast<ptrdiff_t>(n)*static_cast<ptrdiff_t>(instance_);
            Kii_ = Ki_[instance_];
            #ifdef PIECEWISE_TIME
            walltime_Qalpha0 = get_wall_time();
            cputime_Qalpha0 = get_cpu_time();
            #endif

            // AKi_ = A_ * Ki_  (vector of length T)
            dgemv(&kNoTranspose, &ptrdiff_t_T, &ptrdiff_t_n, &bone, A_, &ptrdiff_t_T, Ki_, &one, &zero, AKi_, &one);

            #ifdef PIECEWISE_TIME
            walltime_Qalpha1 += get_wall_time() - walltime_Qalpha0;
            cputime_Qalpha1 += get_cpu_time() - cputime_Qalpha0;
            #endif

            #ifdef PIECEWISE_TIME
            walltime_root0 = get_wall_time();
            cputime_root0 = get_cpu_time();
            #endif

            // Squared Euclidean norm of AKi_; kept up to date inside the task loop.
            AKi_AKi_ = ddot(&ptrdiff_t_T, AKi_, &one, AKi_, &one);

            #ifdef PIECEWISE_TIME
            walltime_root1 += get_wall_time() - walltime_root0;
            cputime_root1 += get_cpu_time() - cputime_root0;
            #endif

            for (int t=0;t<T;t++){
                task = one_to_T_array[t];
                instance_index = task + T*instance_;
                label = Y_matrix[instance_index];
                Theta_t_ = Theta_ + T*task;   // column `task` of Theta
                #ifdef PIECEWISE_TIME
                walltime_aTQa0 = get_wall_time();
                cputime_aTQa0 = get_cpu_time();
                #endif
                // Each step below only updates column `task` of Theta.  Before the
                // column is used, the entries that belong to tasks already visited
                // for this example are copied over from row `task` (this appears to
                // rely on Theta being symmetric).  Which entries those are depends
                // on whether the tasks are currently visited in ascending order.
                if (one_to_T_array[0]==0){
                    // ascending order: rows 0..task-1 of the column
                    ptrdiff_t ptrdiff_t_t = static_cast<ptrdiff_t>(task);
                    dcopy(&ptrdiff_t_t, Theta_+ task, &ptrdiff_t_T, Theta_t_, &one);
                }
                else{
                    // descending order: rows task+1..T-1 of the column
                    ptrdiff_t ptrdiff_t_trev = static_cast<ptrdiff_t>(T-1-task);
                    dcopy(&ptrdiff_t_trev, Theta_t_+task+T, &ptrdiff_t_T, Theta_t_+task+1, &one);
                }
                #ifdef PIECEWISE_TIME
                walltime_aTQa1 += get_wall_time() - walltime_aTQa0;
                cputime_aTQa1 += get_cpu_time() - cputime_aTQa0;
                #endif

                alpha_old = A_[instance_index];

                #ifdef PIECEWISE_TIME
                walltime_root0 = get_wall_time();
                cputime_root0 = get_cpu_time();
                #endif
                AKi_task = AKi_[task];

                // Coefficients handed to MinQuartic (presumably those of the quartic
                // in delta that the one-dimensional subproblem reduces to -- see the
                // helper for the exact convention).
                AKi_Theta_t = ddot(&ptrdiff_t_T, AKi_, &one, Theta_t_, &one);
                a = Kii_ * Kii_;
                b = (4.0) * (Kii_ * AKi_task);
                c = (2.0) * (Kii_ * Theta_t_[task] + AKi_task * AKi_task + AKi_AKi_);
                d = (4.0) * (AKi_Theta_t + two_eta_ * (- label));
                // Bounds on delta that keep the dual variable inside [0, weight_pos]
                // for positive pairs and inside [-weight_neg, 0] for negative pairs.
                if (label==PosLabel){
                    lb = -alpha_old;
                    ub = weight_pos - alpha_old;
                }else{
                    lb = -weight_neg - alpha_old;
                    ub = -alpha_old;
                }
                delta = MinQuartic(a, b, c, d, lb, ub);
                #ifdef PIECEWISE_TIME
                walltime_root1 += get_wall_time() - walltime_root0;
                cputime_root1 += get_cpu_time() - cputime_root0;
                #endif

                // Apply the update
                if (delta != (0.0)) {
                    variables_changed_ = true;
                    // Order is important! AKi must be 'old' here
                    delta2 = delta * delta;
                    AKi_t_delta = AKi_task * delta;

                    #ifdef PIECEWISE_TIME
                    walltime_aTQa0 = get_wall_time();
                    cputime_aTQa0 = get_cpu_time();
                    #endif
                    //UpdateTheta(delta, task);
                    // Column `task` of Theta += delta * AKi_; the diagonal entry
                    // additionally receives Kii*delta^2 + AKi_task*delta.
                    daxpy(&ptrdiff_t_T, &delta, AKi_, &one, Theta_t_, &one);
                    Theta_t_[task] += Kii_ * delta2 + AKi_t_delta; // completes the update
                    #ifdef PIECEWISE_TIME
                    walltime_aTQa1 += get_wall_time() - walltime_aTQa0;
                    cputime_aTQa1 += get_cpu_time() - cputime_aTQa0;
                    #endif

                    #ifdef PIECEWISE_TIME
                    walltime_Qalpha0 = get_wall_time();
                    cputime_Qalpha0 = get_cpu_time();
                    #endif
                    // Keep the cached norm, AKi_ and the dual variable consistent.
                    AKi_AKi_ += (2.0) * AKi_t_delta * Kii_ + Kii_*Kii_*delta2;
                    AKi_[task] += Kii_ * delta;
                    A_[instance_index] += delta;
                    #ifdef PIECEWISE_TIME
                    walltime_Qalpha1 += get_wall_time() - walltime_Qalpha0;
                    cputime_Qalpha1 += get_cpu_time() - cputime_Qalpha0;
                    #endif
                    // Check if the update is too close to machine precision
                    //max_old_new = std::max(std::abs(alpha_old), std::abs(A_[instance_index]));
                    //if (std::abs(delta) < std::numeric_limits<double>::epsilon() * max_old_new) {
                    //  force_check_gap_ = true;
                    //}
                }
            }
            // Visit the tasks in the opposite order for the next example.
            std::reverse(&one_to_T_array[0],&one_to_T_array[T]);
        }

        if (variables_changed_) {
            recompute_gap_ = true;
        } else {
            // A full epoch without a single non-zero step: stop after this pass.
            status_ = SolverStatus::kNoProgress;
            endTraining = true;
            flag = false;
        }

        check_now = CheckGapFrequency > 0 && epoch_counter % CheckGapFrequency == 0;
        if ( check_now || force_check_gap_ || ( (epoch_counter>=MaxNumEpoch || endTraining) &&  recompute_gap_) ){
            #ifdef PIECEWISE_TIME
            printf("walltime_random1: %e, cputime_random1: %e\n",walltime_random1,cputime_random1);
            printf("walltime_root1: %e, cputime_root1: %e\n",walltime_root1,cputime_root1);
            printf("walltime_aTQa1: %e, cputime_aTQa1: %e\n",walltime_aTQa1,cputime_aTQa1);
            printf("walltime_Qalpha1: %e, cputime_Qalpha1: %e\n",walltime_Qalpha1,cputime_Qalpha1);
            walltime_random1 = 0.0;cputime_random1=0.0;walltime_root1=0.0;cputime_root1=0.0;
            walltime_aTQa1=0.0;cputime_aTQa1=0.0;walltime_Qalpha1=0.0;cputime_Qalpha1=0.0;
            #endif
            #ifdef GAP_TIME
            walltime_checkGap0 = get_wall_time();
            cputime_checkGap0 = get_cpu_time();
            #endif

            // Compute the primal and the dual objectives
            primal_loss_ = 0.0;
            dual_loss_ = 0.0;

            sum_alpha_i_yi = 0.0;

            // AtK_ = A_ * K_ (T-by-n), then Theta_ = AtK_ * A_' (T-by-T), from scratch.
            dgemm(&kNoTranspose, &kNoTranspose, &ptrdiff_t_T, &ptrdiff_t_n, &ptrdiff_t_n, &bone, A_,&ptrdiff_t_T, K_, &ptrdiff_t_n, &zero, AtK_, &ptrdiff_t_T);
            dgemm(&kNoTranspose, &kTranspose, &ptrdiff_t_T, &ptrdiff_t_T, &ptrdiff_t_n, &bone, AtK_,&ptrdiff_t_T, A_, &ptrdiff_t_T, &zero, Theta_, &ptrdiff_t_T);

            // coeff * (sum of squared entries of Theta_)
            regularizer_ = coeff *  ddot(&ptrdiff_t_TT, Theta_, &one, Theta_, &one);

            // Theta_AtK_ = coeff * Theta_ * AtK_ : the scores entering the hinge loss.
            dgemm(&kNoTranspose, &kNoTranspose, &ptrdiff_t_T, &ptrdiff_t_n, &ptrdiff_t_T , &coeff, Theta_, &ptrdiff_t_T, AtK_, &ptrdiff_t_T, &zero, Theta_AtK_, &ptrdiff_t_T);
            primal_c = 0.0;
            for (int i=0;i<n;i++){
                for (int t=0;t<T;t++){ 
                    temp = 1.0 - Theta_AtK_[t+i*T]*Y_matrix[t+i*T];
                    if (temp > 0.0){
                        if (Y_matrix[t+i*T] >0.0)
                            temp = weight_pos*temp;
                            //primal_loss_ += weight_pos*temp;
                        else
                            temp = weight_neg*temp;
                            //primal_loss_ += weight_neg*temp;
                        // Kahan summation algorithm
                        primal_y = temp - primal_c;
                        primal_t = primal_loss_ + primal_y;
                        primal_c = (primal_t - primal_loss_) - primal_y;
                        primal_loss_ = primal_t;
                    }
                }
            }

            sum_alpha_i_yi = ddot(&ptrdiff_t_nT, A_, &one, Y_matrix, &one);

            dual_loss_ = sum_alpha_i_yi;

            // Re-scale all terms by 1/(n*T)
            regularizer_ /= num_elements_;
            primal_loss_ /= num_elements_;
            dual_loss_ /= num_elements_;

            primal_obj = primal_loss_ + 0.75 * regularizer_;
            dual_obj = dual_loss_ - 0.25 * regularizer_;
            abs_duality_gap = (primal_obj - dual_obj);
            rel_duality_gap = abs_duality_gap/primal_obj;
            cpu_time_ = get_cpu_time()-cputime0;
            wall_time_ = get_wall_time()-walltime0;
            #ifdef GAP_TIME
            walltime_checkGap1 += get_wall_time() - walltime_checkGap0;
            cputime_checkGap1 += get_cpu_time() - cputime_checkGap0;
            printf("walltime_checkGap1: %e, cputime_checkGap1: %e\n",walltime_checkGap1,cputime_checkGap1);
            #endif

            #ifdef VERBOSE
              std::cout << "  "
                "epoch: " << std::setw(4) << epoch_counter << std::setw(0) << ", "
                "primal: " << primal_obj << ", "
                "dual: " << dual_obj << ", "
                "absolute_gap: " << abs_duality_gap << ", "
                "relative_gap: " << rel_duality_gap << ", "
                "primal_loss: " << primal_loss_ << ", "
                "dual_loss: " << dual_loss_ << ", "
                "regularizer: " << regularizer_ << ", "
                "cpu_time: " << cpu_time_ << ", "
                "wall_time: " << wall_time_ << std::endl;
            #endif
            // Record this check, never writing past the caller-provided capacity.
            if (obj_compute_counter < max_length_vector) {
                primalObjArray[obj_compute_counter] = primal_obj;
                dualObjArray[obj_compute_counter] = dual_obj;
                cpu_time_Array[obj_compute_counter] = cpu_time_;
                wall_time_Array[obj_compute_counter] = wall_time_;
            }
            ++obj_compute_counter;

            diff = abs_duality_gap;
            max_obj = std::max(std::abs(primal_obj), std::abs(dual_obj));

            if (diff < epsilon_ * max_obj) {
                status_ = SolverStatus::kConverged;
                flag = false;
                break;
            } else if (diff <= std::numeric_limits<double>::epsilon() * max_obj) {
                status_ = SolverStatus::kConvergedMachinePrecision;
                flag = false;
                break;
            }

            // (Theoretically) the dual objective should not decrease
            if (dual_obj < dual_objective_old_
                - std::numeric_limits<double>::epsilon() * std::abs(dual_obj)) {
                #ifdef VERBOSE
                  std::cout << "  "
                    "Warning: the dual objective decreased by: " <<
                    dual_objective_old_ - dual_obj << std::endl;
                #endif
                status_ = SolverStatus::kNumericalProblems;
            }
            dual_objective_old_ = dual_obj;

            if (epoch_counter>=MaxNumEpoch){
                status_ = SolverStatus::kMaxNumEpoch;
                flag = false;
                break;
            }
            recompute_gap_ = false;
            force_check_gap_ = false;
        }
    }
    cpu_time_ = get_cpu_time()-cputime0;
    wall_time_ = get_wall_time()-walltime0;

    #ifdef VERBOSE
      std::cout << "MTLOVAHINGESDCA::End("
        "status: " << status_to_string(status_) << ", "
        "epoch: " << epoch_counter << ", "
        "relative_gap: " << rel_duality_gap << ", "
        "absolute_gap: " << abs_duality_gap << ", "
        "primal: " << primal_obj << ", "
        "dual: " << dual_obj << ", "
        "cpu_time: " << cpu_time_ << ", "
        "wall_time: " << wall_time_ << ")" << std::endl;
    #endif
    // Closing entry: last known objectives together with the total elapsed time.
    if (obj_compute_counter < max_length_vector) {
        primalObjArray[obj_compute_counter] = primal_obj;
        dualObjArray[obj_compute_counter] = dual_obj;
        cpu_time_Array[obj_compute_counter] = cpu_time_;
        wall_time_Array[obj_compute_counter] = wall_time_;
    }
    #ifdef COMPLETE_TIME
    printf("Total wall time: %e, total cpu time: %e\n",get_wall_time()-walltime0, get_cpu_time()-cputime0);
    #endif
    *pInfo = createInfoStruct(status_,cpu_time_,wall_time_,n,T,eta,weight_pos,weight_neg,primal_obj,dual_obj,abs_duality_gap,rel_duality_gap,
        primal_loss_,dual_loss_,regularizer_,epsilon_,epoch_counter,MaxNumEpoch,CheckGapFrequency,seed);

    delete[] one_to_n_array;
    delete[] one_to_T_array;
    delete[] Y_matrix;
    delete[] AKi_;
    delete[] AtK_;
    delete[] Theta_AtK_;
}


void mexFunction(int nout, mxArray *out[],
/* Output variables */
int nin, const mxArray *in[]) /* Input variables */
{

    enum {IN_Y = 0, IN_K, IN_ETA, IN_EPSILON, IN_CHECKGAP, IN_MAXEPOCH, IN_SEED, IN_W_POS, IN_W_NEG};
    enum {OUT_A = 0, OUT_THETA, OUT_INFO, OUT_END};
    
    /*Declare the size of the inputs*/
    int n, T;
    /*Declare variables for the input arguments.*/
    double *Y = static_cast<double*> (mxGetData(in[IN_Y]));
    n = (int) mxGetM(in[0]);//n=mT
    T = static_cast<int>(Y[n-1]);
    double *K_ = static_cast<double*> (mxGetData(in[IN_K]));
    double eta = mxGetScalar(in[IN_ETA]);
    
    double parameters[] = {1e-2, 10.0, 500.0, 0.0, 1.0, 1.0};
    double epsilon_ = 1e-2;
    int CheckGapFrequency = 10;
    int MaxNumEpoch = 500;
    double seed = 0.0;
    double weight_pos = 1.0;
    double weight_neg = 1.0;
//     double NegLabel = -1.0;
    int shift = 3;
    int varleft = nin-shift;
    int counter = 0;
    int ione = 1;
    while(varleft >0){
        parameters[counter] = mxGetScalar(in[shift+counter]);
	counter++;
	varleft--;
    }
    epsilon_ = parameters[0];
    CheckGapFrequency = static_cast<int>(parameters[1]);
    MaxNumEpoch = static_cast<int>(parameters[2]);
    seed = parameters[3];
    weight_pos = parameters[4];
    weight_neg = parameters[5];
    
    out[0] = mxCreateDoubleMatrix(0,0,mxREAL);
    mxSetM(out[0],T);
    mxSetN(out[0],n);
    mxSetPr(out[0], (double *)mxMalloc(sizeof(double)*n*T));
    double *A_ = mxGetPr(out[0]);
    
    out[1] = mxCreateDoubleMatrix(0,0,mxREAL);
    mxSetM(out[1],T);
    mxSetN(out[1],T);
    mxSetPr(out[1], (double *)mxMalloc(sizeof(double)*T*T));
    double *Theta = mxGetPr(out[1]);
    mxArray *mxInfo = NULL;

    int max_length_vector = static_cast<int>((MaxNumEpoch-1)/CheckGapFrequency + 2);
    out[3] = mxCreateDoubleMatrix(0,0,mxREAL);
    mxSetM(out[3],max_length_vector);
    mxSetN(out[3],ione);
    mxSetPr(out[3], (double *)mxMalloc(sizeof(double)*max_length_vector));
    double *primalObjArray = mxGetPr(out[3]);

    out[4] = mxCreateDoubleMatrix(0,0,mxREAL);
    mxSetM(out[4],max_length_vector);
    mxSetN(out[4],ione);
    mxSetPr(out[4], (double *)mxMalloc(sizeof(double)*max_length_vector));
    double *dualObjArray = mxGetPr(out[4]);
    
    out[5] = mxCreateDoubleMatrix(0,0,mxREAL);
    mxSetM(out[5],max_length_vector);
    mxSetN(out[5],ione);
    mxSetPr(out[5], (double *)mxMalloc(sizeof(double)*max_length_vector));
    double *cpu_time_Array = mxGetPr(out[5]);

    out[6] = mxCreateDoubleMatrix(0,0,mxREAL);
    mxSetM(out[6],max_length_vector);
    mxSetN(out[6],ione);
    mxSetPr(out[6], (double *)mxMalloc(sizeof(double)*max_length_vector));
    double *wall_time_Array = mxGetPr(out[6]);

    sdca_squared_2norm(Y,K_,n,T,eta,weight_pos,weight_neg,MaxNumEpoch,epsilon_,CheckGapFrequency,seed,A_,Theta,&mxInfo,primalObjArray,dualObjArray,cpu_time_Array,wall_time_Array,max_length_vector);
    out[2] = mxInfo;
    return;
}
