Softmax Regression

author: daodeiv (David Stankov) daodavid

The softmax function is one of the most popular building blocks in machine learning. In essence, it turns arbitrary real values into probabilities by means of the exponential function, and it can be considered a generalization of the sigmoid function. We can use softmax for multi-class classification, and it also appears in many fields of science, such as statistical physics (Gibbs distributions), quantum statistics, information theory, and neural networks. Softmax is attractive in classification problems because it is simple to implement and in many cases gives satisfying results and good enough performance.

We will use the Iris dataset because it is comparatively simple and very convenient for studying machine learning.

We are going to express the label values as one-hot encoded (so-called dummy) variables.
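A minimal preparation sketch, assuming scikit-learn's bundled copy of the Iris dataset and a hypothetical 70/30 train/test split; the variable names (X_train, Y_train, y_train, ...) are my own and are reused in the later snippets:

```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data, iris.target                 # 4 features, integer labels 0, 1, 2
Y = pd.get_dummies(y).to_numpy(dtype=float)   # one-hot (dummy) encoding of the labels

# hypothetical split into training and test sets
X_train, X_test, Y_train, Y_test, y_train, y_test = train_test_split(
    X, Y, y, test_size=0.3, random_state=0)
```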

Softmax: definition and how it works

The softmax function $\sigma: \; \Re^k \; \rightarrow \; \Re^k $ can be defined by the formula:

$$ (1) \;\; \sigma_{softmax}({z^i})_{ij} =\frac {e^{z_{ij}} }{ \sum_p^k e^{z_{ip}} } $$

$\; \; z_{ij} = \sum_p x_{ip} w_{jp} + b_j $

$\; \; \vec{z_i} = z^i= [z_{i1}, z_{i2}, ...z_{ik} ] $

$$\sigma_{softmax}(W,b,X^{i})_{ij}=\frac{e^{ \sum_p x_{ip} w_{jp} + b_j}}{\sum_q^k e^{ \sum_p x_{ip} w_{qp} + b_q}} $$

Index conventions. When we write, for example, $Q_{lh}$ or $q_{lh}$, we mean the element of the matrix $Q_{M\times N}$ where $l$ refers to the row and $h$ to the column. When we write $Q^{i}$ or $q^{i}$, we mean the row vector $\vec Q^{i}= [q_{i1},q_{i2},...,q_{in}]$ of $Q_{m\times n}$. In eq. (1), $z^i=[z_{i1}, z_{i2}, ...,z_{ik}]$.


The softmax function takes as input a vector $z^i$ with $K$ components $z_{i1}, z_{i2}, ...,z_{iK}$ and normalizes it into a probability distribution $p^{i}$ consisting of $K$ probabilities $p_{i1},...,p_{iK}$ proportional to the exponentials of the input values $z^{i}$. That is, prior to applying softmax, some components of $z^{i}$ could be negative or greater than 1 and need not sum to 1; after applying it, larger input components correspond to larger probabilities and $\sum_j p_{ij}=1$. The $w_{jp}$ are the weights (estimators), $w_{jp}\in W^{K\times N}$, where $K$ corresponds to the number of class labels and $N$ to the number of attributes (features) of the training data, and $b = [b_1, ...,b_K]$ is the bias term, with one component per class label.

The fact that the softmax function outputs a probability distribution makes it suitable for a probabilistic interpretation in classification tasks.

According to our dataset, we can write the following expressions.

$W= \begin{bmatrix} weight^1\rightarrow class \; 1(Iris-setosa) \\ weight^2\rightarrow class\; 2(Iris-versicolor) \; \\ weight^{3}\rightarrow class \;3(Iris-virginica) \; \end{bmatrix} = \begin{bmatrix} \vec W^1 \\ \vec W^2\ \\ \vec W^3 \end{bmatrix} = \begin{bmatrix} w_{11} & w_{12} & w_{13} & w_{14} \\ w_{21} & w_{22} & w_{23} & w_{24} \\ w_{31} & w_{32} & w_{33} & w_{34} \end{bmatrix} $

     $ B= \begin{bmatrix} b_1 \\ b_2\ \\ b_3 \end{bmatrix}\;\;\;$

The vector $\vec W^i=[w_{i1},...,w_{in}]$ is the estimator vector for target class (label) $i$; $n$ corresponds to the features (predictors) of $X$.

In matrix form, $Z$ is expressed as $Z = XW^T + B$:

$Z = \begin{bmatrix} x_{11} & x_{12} & x_{13} & x_{14} \\ x_{21} & x_{22} & x_{23} & x_{24}\\ ... & ... & ... & ...\\ x_{m1} & x_{m2} & x_{m3} & x_{m4} \end{bmatrix} \times \begin{bmatrix} w_{11} & w_{21} & w_{31} \\ w_{12} & w_{22} & w_{32} \\ w_{13} & w_{23} & w_{33} \\ w_{14} & w_{24} & w_{34} \end{bmatrix} + \begin{bmatrix} b_1 \\ b_2 \\ b_3 \end{bmatrix} = \begin{bmatrix} z_{11} & z_{12} & z_{13} \\ z_{21} & z_{22} & z_{23} \\ ... & ... & ... \\ z_{m1} & z_{m2} & z_{m3} \end{bmatrix} $

The softmax function computes the probability that a training example $X^{(i)}$ belongs to class $y^{j}$, given the weight matrix $W$ and the bias $\vec b$.
So we compute the probability:

$$p_{ij}=P(y_{j} \;| \;z^i) = \sigma_{softmax (z)_{ij}}=\frac{e^{z_{ij}}}{\sum_p^K e^{z_{ip}}} $$

For all $p_{ij}$ and given the target classes $Y = [y^1\;(\text{Iris-setosa}),\; y^2\;(\text{Iris-versicolor}),\; y^3\;(\text{Iris-virginica})]$, we can write:

$$ P = \begin{bmatrix} P(y_{1} \,|\,z^{1})_{11} & P(y_{2} \,|\,z^{1})_{12} & P(y_{3} \,|\,z^{1})_{13} \\ ... & ... & ... \\ P(y_{1} \,|\,z^{m})_{m1} & P(y_{2} \,|\,z^{m})_{m2} & P(y_{3} \,|\,z^{m})_{m3} \end{bmatrix} = \begin{bmatrix} \frac{e^{z_{11}}}{\sum_{j}^3 e^{z_{1j}}} & \frac{e^{z_{12}}}{\sum_{j}^3 e^{z_{1j}}} & \frac{e^{z_{13}}}{\sum_{j}^3 e^{z_{1j}}} \\ \\ ... & ... & ... \\ \\ \frac{e^{z_{m1}}}{\sum_{j}^3 e^{z_{mj}}} & \frac{e^{z_{m2}}}{\sum_{j}^3 e^{z_{mj}}} & \frac{e^{z_{m3}}}{\sum_{j}^3 e^{z_{mj}}}\end{bmatrix} $$

For example, the probability that record $x^1$ belongs to target label $y_2$ (Iris-versicolor) is calculated as:

$$p_{12} =P(y=2 \,|\, x^{1}) = \frac{ e^{z_{12}} }{ \sum_p^3 e^{z_{1p}}}=\frac{ e^{\sum_v x_{1v} w_{2v} + b_2}}{ \sum_k^3 e^{\sum_v x_{1v} w_{kv} + b_k}}$$

In all cases $p_{ij} \in [0,1]$ and $\sum_j p_{ij}= 1$.

Let's see how the softmax function can be applied concretely to our training dataset. First, let us define a weight matrix $W$ and bias $\vec b$:

$W =\begin{bmatrix} w_{11} & w_{12} & w_{13} & w_{14} \\ w_{21} & w_{22} & w_{23} & w_{24} \\ w_{31} & w_{32} & w_{33} & w_{34} \end{bmatrix} = \begin{bmatrix} 1.38618464 & 1.9151765 & -0.28863154 & 0.40849489 \\1.31642223 & 0.76753677 & 1.1482473 & 0.74274245 \\0.29739313 & 0.31728673 & 2.14038423 & 1.84876265\end{bmatrix}$

   $B = \begin{bmatrix} 1.18749764 \\ 1.16215506 \\0.6503473 \end{bmatrix} $

I've prepared the weight matrix $W$ and bias $B$ in advance. How? We will see later.

The implementation of the softmax function:
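A possible NumPy sketch of eq. (1), using the $W$ and $B$ given above; the max-subtraction for numerical stability is my addition and not part of the formula, and X_train comes from the earlier (hypothetical) split:

```python
import numpy as np

def softmax(W, b, X):
    """Return the probability matrix P (m x k) for the net input Z = X W^T + b."""
    Z = X @ W.T + b                        # net input, Z = XW^T + B
    Z = Z - Z.max(axis=1, keepdims=True)   # shift each row for numerical stability (result unchanged)
    expZ = np.exp(Z)
    return expZ / expZ.sum(axis=1, keepdims=True)

# the W and B quoted above, with B flattened to a 1-D vector b
W = np.array([[1.38618464, 1.9151765, -0.28863154, 0.40849489],
              [1.31642223, 0.76753677, 1.1482473,  0.74274245],
              [0.29739313, 0.31728673, 2.14038423, 1.84876265]])
b = np.array([1.18749764, 1.16215506, 0.6503473])

P = softmax(W, b, X_train)                 # probability matrix for the training records
```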

From the output $P_{M\times K}$, let us consider the row with index $1$:

$P_{1,1} = 0.004 \rightarrow $ record $X^{1}$ has about a $0.4$% chance of belonging to class 1 'Iris-setosa'
$P_{1,2} = 0.167 \rightarrow $ record $X^{1}$ has about a $17$% chance of belonging to class 2 'Iris-versicolor'
$P_{1,3} = 0.83 \rightarrow $ record $X^{1}$ has about an $83$% chance of belonging to class 3 'Iris-virginica'

From the above result we can draw a conclusion:
We cannot be certain which class label record $X^{1}$ belongs to, but its third column has an $83$% chance, which is the largest one; therefore, we would assume the record belongs to class 'Iris-virginica'. In fact, that is the correct assumption when compared with the actual data $Y$. By applying this evaluation process to all rows, we can verify that the weight matrix $W$ and bias $b$ give about $95$% accuracy.
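A short sketch of that argmax-based evaluation, assuming the P, Y_train, and NumPy import from the snippets above (the 95% figure quoted is the author's; the code only shows how such a number could be computed):

```python
# predicted class = index of the largest probability in each row of P
y_pred = np.argmax(P, axis=1)
y_true = np.argmax(Y_train, axis=1)        # Y_train is one-hot encoded
accuracy = np.mean(y_pred == y_true)
print(f"accuracy: {accuracy:.2f}")
```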

How did I find the weight matrix $W$ and bias $B$? I simply used LogisticRegression from scikit-learn and took its coefficients. Let us now try to work out a way of finding the weights $W$ and bias $b$ ourselves, and see whether it is possible to improve the estimators.
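One plausible way such coefficients could be obtained with scikit-learn; a sketch assuming the X_train and integer labels y_train from the earlier split, and not necessarily the exact call used here:

```python
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=200)             # multinomial logistic regression for the 3 classes
clf.fit(X_train, y_train)                          # y_train: integer class labels, not one-hot
W_sklearn, b_sklearn = clf.coef_, clf.intercept_   # shapes (3, 4) and (3,)
```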

Optimization of the Softmax Loss with Gradient Descent (deep math calculation)

As the objective (loss) function we will use cross-entropy, the same loss used in binary logistic regression. The softmax loss is defined as:

$$ (2)\;\; \mathcal{L}(Y,Z)=-\sum_i^m\sum_j^k y_{ij} \log (p(Z)_{ij})$$

where $m$ is the number of records, $k$ is the number of classes, $y_{ij}$ are the label values, $p_{ij}=\sigma_{softmax}(Z)_{ij}$ are the predicted class probabilities, and $Z$ is the net input, which is a function of the weight matrix $W$, the bias $b$, and $X$.

Our goal is to minimize eq. (2) in order to find the best estimators $w_{ij}\in W$ and $b$ given the Iris data $X$ and the label data $Y$.
We are going to use gradient descent for the optimization. Note that eq. (2) is a function of all the weights $w_{ij}$, the biases $b_j$, the training data $X$, and the label data $Y$.

   Gradient descent is defined as :

$$ \; \; \; \; \; \; \;\begin{matrix} w_{ij} = w_{ij} - \lambda \nabla w_{ij}L(W,b,X,Y) \\ \\ b_{j} = b_j - \lambda\nabla b_{j}L(W,b,X,Y) \end{matrix} $$             where $\lambda$ is learning rate or step size

Plugging eq. (2) into gradient descent, we obtain the general formula:

$$(3) \; \; \; \; \; \; \begin{matrix} w_{ij} = w_{ij} - \lambda \nabla_{w_{ij}}L, & \;\;\nabla_{w_{ij}}L = -\frac{\partial}{\partial w_{ij}}\Big(\sum_m\sum_n y_{mn} \log {p_{mn}}\Big) \\ \\ b_{j} = b_j - \lambda \nabla_{b_{j}}L, & \;\;\nabla_{b_{j}}L = -\frac{\partial}{\partial b_{j}}\Big(\sum_m\sum_n y_{mn} \log{p_{mn}}\Big) \end{matrix} $$

Before we take on $\nabla_{w_{ij}}L(W,b)$, we will introduce some mathematical techniques that will make the work easier.
To simplify the summation over indices, we introduce the Kronecker symbol.

$$\delta_{ij} = \begin{cases} 1 & \text{if } i=j \\ 0 & \text{if } i\ne j \end{cases}$$

$$ \delta_{ij} = \begin{bmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ 0 & 0 & 1 \end{bmatrix}$$
In many of the sums that follow we will omit the $\sum$ symbol; it is simply left implicit, following the Einstein summation convention.
For example, the equation

$$z_{ij} = \sum_p x_{ip} w_{jp} + b_j $$ can, by applying the Einstein convention, be rewritten as:

$$z_{ij} = x_{ip} w_{jp} + b_j$$
The sign $\sum_p$ is omitted: the summation over $p$ is implied (by default) because $p$ appears twice. Whenever an index is repeated, that is the indicator that a $\sum$ over it exists but is simply not written.
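This is exactly the convention NumPy's np.einsum implements; a small sketch, assuming the X (m×n), W (k×n), and b (length k) arrays from the earlier snippets:

```python
import numpy as np

# z_ij = x_ip * w_jp + b_j : the repeated index p is summed over implicitly
Z = np.einsum("ip,jp->ij", X, W) + b
# equivalent to the explicit matrix form Z = X @ W.T + b
```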

In order to minimize the entropy of the data, we must take up the minimization of the cross-entropy loss with respect to $w_{ij}$. The cross-entropy loss is a function of all feature vectors $X_{M\times N}$, all labels $Y_{M\times K}$, the weights $W_{K\times N}$, and the bias $B_K$:
$L = L(X,Y,W,B)$

$\frac{\partial L}{\partial w_{ij}}=-\frac{\partial}{\partial w_{ij}}\Big(\sum_m\sum_n y_{mn} \log {p_{mn}}\Big)$ $=-\sum_m\sum_n y_{mn}\frac{\partial \log {p_{mn}}}{\partial w_{ij}}$

$=-\sum_m\sum_n\frac{y_{mn}}{p_{mn}}\frac{\partial p_{mn}}{\partial w_{ij}} $ $=-\sum_m\sum_n \frac{y_{mn}}{p_{mn}}\frac{\partial p_{mn}}{\partial z_{vp}}\frac{\partial z_{vp}}{\partial w_{ij}} $

Since $z_{vp}$ depends only on the weights of class $p$, $z_{vp} = f(w_{p1},...,w_{pn})$, we have $\frac{\partial z_{vp}}{\partial w_{ij}} = 0\;$ if $\; p\ne i$, so we can write $ \frac{\partial z_{vp}}{\partial w_{ij}}=\delta_{pi}\frac{\partial z_{vp}}{\partial w_{ij}}$. Likewise, since $p_{mn} = f(z_{m1},...,z_{mk})$, we have $\frac{\partial p_{mn}}{\partial z_{vp}} = 0$ if $ m\ne v $, so $\frac{\partial p_{mn}}{\partial z_{vp}}=\delta_{mv}\frac{\partial p_{mn}}{\partial z_{vp}} $. Plugging these in, we obtain

                    $ =-\sum_m\sum_n \frac{y_{mn}}{p_{mn}}\delta_{mv}\frac{\partial p_{mn}}{\partial z_{vp}} \delta_{pi}\frac{\partial z_{vp}}{\partial w_{ij}} $

using common Kronecker $\delta$ properties

                    $=-\sum_m\sum_n \frac{y_{mn}}{p_{mn}}\delta_{mm}\delta_{ii}\frac{\partial p_{mn}}{\partial z_{mi}}\frac{\partial z_{mi}}{\partial w_{ij}} $ $=-\sum_m\sum_n \frac{y_{mn}}{p_{mn}}\frac{\partial p_{mn}}{\partial z_{mi}}\frac{\partial z_{mi}}{\partial w_{ij}} $

We've successfully reduced the number of sum operations using Einstein's convention and the Kronecker symbol, and obtained

           $\frac{\partial L}{\partial w_{ij}}=-\sum_m\sum_n \frac{y_{mn}}{p_{mn}}\frac{\partial p_{mn}}{\partial z_{mi}}\frac{\partial z_{mi}}{\partial w_{ij}}$

Let us focus on the terms $\frac{\partial p_{mn}}{\partial z_{mi}}$ and $\frac{\partial z_{mi}}{\partial w_{ij}}$.

$\frac{\partial p_{mn}} {\partial z_{mi}}=\frac{\partial\frac { e^{z_{mn}} }{ \sum_ke^{z_{mk}}} }{\partial z_{mi}}$ $=\frac{1}{(\sum_ke^{z_{mk}})^2}\times \Big(\frac{\partial e^{z_{mn}} }{\partial z_{mi}}\times(\sum_ke^{z_{mk}}) - e^{z_{mn}}\times\frac{\partial (\sum_ke^{z_{mk}})}{\partial z_{mi}} \Big)$

$=\frac{e^{z_{mn}}\times\frac{\partial z_{mn}}{\partial z_{mi}}}{\sum_ke^{z_{mk}}} - \frac{e^{z_{mn}}}{\sum_ke^{z_{mk}}}\times\frac{ \sum_k e^{z_{mk}} \frac{ \partial z_{mk}}{\partial z_{mi}}} {\sum_ke^{z_{mk}}}$

From $\frac{\partial z_{mk}}{\partial z_{mi}}=0$ if $k\ne i$ and $\frac{\partial z_{mk}}{\partial z_{mi}}=1$ if $k = i$, it follows that $\frac{\partial z_{mk}}{\partial z_{mi}}=\delta_{ki}$. Plugging this in:

$=\frac{e^{z_{mn}}\times \delta_{ni} }{\sum_ke^{z_{mk}}} - \frac{e^{z_{mn}}}{\sum_ke^{z_{mk}}}\times\frac{ \sum_k e^{z_{mk}}\delta_{ki}} {\sum_ke^{z_{mk}}}$

$=\frac{e^{z_{mn}}\times \delta_{ni} }{\sum_ke^{z_{mk}}} - \frac{e^{z_{mn}}}{\sum_ke^{z_{mk}}}\times\frac{ \sum_k e^{z_{mi}}\delta_{ii}} {\sum_ke^{z_{mk}}}$ $=\frac{e^{z_{mn}}\times \delta_{ni} }{\sum_ke^{z_{mk}}} - \frac{e^{z_{mn}}}{\sum_ke^{z_{mk}}}\times \frac{ e^{z_{mi}}}{\sum_ke^{z_{mk}}}$

from eq.(1) $\Rightarrow$ $\frac{ e^{z_{mn}} }{ \sum_k e^{z_{mk} } }=p_{mn}$ and $\frac{ e^{z_{mi}} }{ \sum_k e^{z_{mk} } }=p_{mi}$ when we apply it, we will achieve

$=p_{mn}\times \delta_{ni} - p_{mn}\times p_{mi}$
$=p_{mn}(\delta_{ni} - p_{mi})$

For the term $\frac{\partial p_{mn}} {\partial z_{mi}}$ we obtain:
$$(4) \; \; \; \; \frac{\partial p_{mn}} {\partial z_{mi}}=p_{mn}(\delta_{ni} - p_{mi})$$
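Eq. (4) can be sanity-checked numerically. The sketch below compares the analytic Jacobian $p_{mn}(\delta_{ni} - p_{mi})$ for a single, arbitrary made-up row $z$ with a central finite-difference estimate:

```python
import numpy as np

def softmax_row(z):
    e = np.exp(z - z.max())
    return e / e.sum()

z = np.array([0.5, -1.2, 2.0])                     # an arbitrary net-input row
p = softmax_row(z)

# analytic Jacobian from eq. (4): dp_n/dz_i = p_n (delta_ni - p_i)
jac_analytic = np.diag(p) - np.outer(p, p)

# numerical Jacobian by central differences
eps = 1e-6
jac_numeric = np.zeros((3, 3))
for i in range(3):
    dz = np.zeros(3)
    dz[i] = eps
    jac_numeric[:, i] = (softmax_row(z + dz) - softmax_row(z - dz)) / (2 * eps)

print(np.allclose(jac_analytic, jac_numeric, atol=1e-8))   # expected: True
```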

$\frac{\partial z_{mi}}{\partial w_{ij}} =\frac{\partial ( \sum_k x_{mk}w_{ik} + b_i)}{\partial w_{ij}}= \sum_k x_{mk}\frac{\partial w_{ik}}{\partial w_{ij}}$

$\frac{\partial w_{ik}}{\partial w_{ij}} = \delta_{kj}$, which can be verified directly.

$= \sum_k x_{mk} \delta_{kj} = x_{mj}$

and for $\frac{\partial z_{mi}}{\partial w_{ij}}$ we obtain:
$$(5) \;\;\;\;\;\frac{\partial z_{mi}}{\partial w_{ij}} = x_{mj} $$

Applying eqs. (4) and (5) in $\frac{\partial L}{\partial w_{ij}}=-\sum_m\sum_n \frac{y_{mn}}{p_{mn}}\frac{\partial p_{mn}}{\partial z_{mi}}\frac{\partial z_{mi}}{\partial w_{ij}}$, we have:

$\frac{\partial L}{\partial w_{ij}}=-\sum_m\sum_n \frac{y_{mn}}{p_{mn}}p_{mn}(\delta_{ni} - p_{mi})x_{mj}=-\sum_m\sum_n y_{mn}(\delta_{ni} - p_{mi})x_{mj} $

$ =-\sum_m\sum_n y_{mn}\delta_{ni}x_{mj} + \sum_m\sum_n y_{mn} p_{mi}x_{mj} $

In the first term the Kronecker delta lets us replace the index $n$ with $i$ and drop the $\sum_n$ (note that there is no $\sum_i$):

$ =-\sum_m \Big(\sum_n y_{mn}\delta_{ni}\Big)x_{mj} + \sum_m\sum_n y_{mn} p_{mi}x_{mj} $

                     $ =-\sum_m y_{mi}x_{mj} + \sum_m\sum_n y_{mn} p_{mi}x_{mj} $
For the second term, note that $\sum_n y_{mn}=1$
(each row of $Y$ is one-hot encoded, so its components sum to 1). Applying this:

$ =-\sum_m y_{mi}x_{mj} + \sum_m 1. p_{mi}x_{mj} = \sum_m p_{mi}x_{mj}-\sum_m y_{mi}x_{mj}$

$ = \sum_m( p_{mi}-y_{mi})x_{mj}$

We've reached the most important result, the gradient of the cross-entropy with respect to $w_{ij}$:

$$(6) \;\;\;\;\nabla_{w_{ij}}L(W,b)= \sum_m( p_{mi}-y_{mi})x_{mj}$$

If we apply the same steps for $\nabla_{b_{i}}L(W,b)$ (an easier calculation), we obtain the gradient for the bias, which looks like this:

$$(7)\;\;\;\;\nabla_{b_{i}}L(W,b)= \sum_m( p_{mi}-y_{mi})$$

Although eq. (6) seems simple and elegant, it is written in index (tensor) form, not matrix form, so its implementation becomes more difficult, especially when we want to use our beloved library NumPy. But we can write the equation in matrix form, as follows:

$\nabla_W L = \begin{bmatrix} \nabla_{w_{11}} L & \nabla_{w_{12}}L &... &\nabla_{ w_{1j}}L \\ \nabla_{ w_{21}}L & \nabla_{ w_{22}}L &... &\nabla_{ w_{2j}}L \\ ... & ... & ... & ... \\ \nabla_{ w_{i1}}L & \nabla_{ w_{i2}}L & ...& \nabla_{ w_{ij}}L \end{bmatrix} $ $ =\begin{bmatrix} p_{11} -y_{11} & p_{21}-y_{21} & ... & p_{m1}-y_{m1}\\ p_{12} -y_{12} & p_{22}-y_{22} & ... & p_{m2}-y_{m2}\\ \;\;...\;\;\; & \;\;...\;\;\; &\;\;...\;\;\; &\;\;...\;\;\; \\ p_{1i} -y_{1i} & p_{2i}-y_{2i} & ... & p_{mi}-y_{mi} \end{bmatrix}$ $\begin{bmatrix} x_{11} & x_{12} &...& x_{1j} \\ x_{21} & x_{22} &...& x_{2j} \\ ... & ... & ... &... \\ x_{m1} & x_{m2} &...& x_{mj}\end{bmatrix}$

$$(8)\;\;\;\;\nabla_W L = (P-Y)^T\cdot X$$


$$\;\;\;\;\nabla_{b_i} L = \sum_m (P-Y)_{mi}$$ or, equivalently,

$$(9)\;\;\;\;\nabla_{b} L =\Big[\sum_m (P-Y)_{m1}, \sum_m (P-Y)_{m2},..., \sum_m (P-Y)_{mk}\Big]$$
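In NumPy, eqs. (8) and (9) translate almost verbatim; a small sketch, with P, Y, and X standing for the probability matrix, the one-hot labels, and the feature matrix:

```python
grad_W = (P - Y).T @ X            # eq. (8), shape (k, n)
grad_b = (P - Y).sum(axis=0)      # eq. (9), shape (k,)
```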

We've written eqs. (6) and (7) in matrix form, and the result is surprisingly simple and easy to implement. If we plug eqs. (8) and (9) into the gradient descent equations
$$ \; \; \; \; \; \; \;\begin{matrix} w_{ij} = w_{ij} - \lambda \nabla_{w_{ij}}L(W,b) \\ \\ b_{j} = b_j - \lambda\nabla_{b_{j}}L(W,b) \end{matrix} $$

we obtain our minimization algorithm for the cross-entropy loss, which finds the best estimators $w_{ij}$. The minimization algorithm using gradient descent is defined as:

$$(10) \; \; \; \; \; \; \; W= W -\lambda(P-Y)^T.X $$
$$(11) \; \; \; \; \; \; \; b_i = b_i -\lambda \sum_m(P-Y)_{mi} $$

where $W$ is the weight matrix, $\lambda$ is the learning rate or step size, $P$ contains the prediction values produced by softmax, $Y$ is the target values, and $X$ is the training data (feature vectors).

Implementation of Softmax using numpy

The implementation using NumPy can be expressed, in pseudo-NumPy notation, as:

$$ W = W - \gamma\,(\mathrm{softmax}(W,b,X) - Y)^T\cdot X$$
$$ b_i = b_i - \gamma \sum_m(\mathrm{softmax}(W,b,X) - Y)_{mi} $$

where $W$ is the matrix of our estimator coefficients, $\gamma = \lambda\cdot(1/m)$ is the step size, $\lambda$ is the learning rate, $Y$ is the target values, and $X$ is the training data (feature vectors). The implementation of gradient descent:
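A minimal sketch of that loop, reusing the softmax function defined earlier; the function name fit_softmax and the zero initialization of $W$ and $b$ are my assumptions:

```python
import numpy as np

def fit_softmax(X, Y, lr=0.1, max_iter=100):
    """Gradient descent for softmax regression, eqs. (10) and (11).

    X: (m, n) feature matrix, Y: (m, k) one-hot labels.
    Returns the learned weight matrix W (k, n) and bias vector b (k,).
    """
    m, n = X.shape
    k = Y.shape[1]
    W = np.zeros((k, n))
    b = np.zeros(k)
    gamma = lr / m                        # step size scaled by 1/m, as in the text
    for _ in range(max_iter):
        P = softmax(W, b, X)              # probabilities, softmax as defined above
        W -= gamma * (P - Y).T @ X        # eq. (10)
        b -= gamma * (P - Y).sum(axis=0)  # eq. (11)
    return W, b
```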

Let us test our implementation and train it on the Iris data.
Training data
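A hypothetical training call with the hyper-parameters quoted below (learning rate 0.1, 100 iterations), reusing the variable names from the earlier snippets:

```python
W_hat, b_hat = fit_softmax(X_train, Y_train, lr=0.1, max_iter=100)

P_train = softmax(W_hat, b_hat, X_train)
train_acc = np.mean(np.argmax(P_train, axis=1) == np.argmax(Y_train, axis=1))
print(f"train accuracy: {train_acc:.2f}")
```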

We've achieved 70% accuracy with learning rate $\lambda = 0.1$ and 100 iterations.

Testing on data X_test and y_test
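A matching sketch for the held-out split, assuming the X_test and one-hot Y_test from the earlier split:

```python
P_test = softmax(W_hat, b_hat, X_test)
test_acc = np.mean(np.argmax(P_test, axis=1) == np.argmax(Y_test, axis=1))
print(f"test accuracy: {test_acc:.2f}")
```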

Regularization of softmax by learning rate and max iterations

We will examine the behavior of gradient descent with respect to different learning rates and maximum numbers of iterations.

Debugging tool implementation
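One way such a debugging tool could look: a variant of the training loop that snapshots the weight matrix after every epoch, so that individual components such as $w_{11}$ or $w_{22}$ can be plotted against the epoch. All names here are my own:

```python
def fit_softmax_debug(X, Y, lr=0.1, max_iter=100):
    """Like fit_softmax, but also returns the weight trajectory per epoch."""
    m, n = X.shape
    k = Y.shape[1]
    W, b = np.zeros((k, n)), np.zeros(k)
    gamma = lr / m
    history = []
    for _ in range(max_iter):
        P = softmax(W, b, X)
        W -= gamma * (P - Y).T @ X
        b -= gamma * (P - Y).sum(axis=0)
        history.append(W.copy())          # snapshot of the weights after this epoch
    return W, b, np.array(history)        # history[epoch, class, feature]

# e.g. history[:, 1, 1] is the trajectory of w_22 (0-based indices)
```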

In both graphics above we see the evolution of the weight components $w_{22}$ and $w_{11}$ with respect to log(epoch), i.e. the number of iterations. From the graphics we may conclude that the values are still essentially arbitrary (wrong); moreover, a larger $\lambda$ corresponds to larger fluctuations, leading to a larger interval in which the values vary. The reason for this is the small number of iterations.

Increasing the max iterations to 150

For $\lambda \in [10,5,2,1,0.1]$ the weights have a limit $$\lim_{epoch \to 150} w_{ij}(epoch) = L$$ where $L$ is neither arbitrary nor infinite. This means that gradient descent becomes stable, with a tendency to give plausible results, and the $w_{ij}$ can be considered approximately correct. For $\lambda \in [10,5,2,0.1]$ the accuracy achieved is $100\%$, which is a sign that the algorithm has overfitted; perhaps the reason is that it has learned the noise in the data, which means the accuracy on new data would decrease. For $\lambda \in [100,50]$ we still have underfitting, because the learning rate is too large and gradient descent makes big jumps.

In fact, we are not able to judge which are the best $\lambda$ and $M$, since the dataset is small, although we can assume that a smaller $\lambda$ and a bigger $M$ often lead to good results at the expense of execution time and a bigger risk of overfitting. In summary, every dataset has its own set of $\lambda$ and $M$ which will give satisfying accuracy, and in order to find the best ones we have to experiment with them.

 Conclusion

We have walked step by step from the math to the implementation of multi-class classification using softmax regression. This can be used as a guide for implementing logistic regression, and some of the notes related to optimization may also be useful for neural networks.