import warnings
import datetime
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import norm
from IPython.display import display, Markdown, Math
sns.set()
warnings.filterwarnings('ignore')
def printmd(string): display(Markdown(string))
def latex(out): printmd(f'{out}')
def pr(string): printmd('***{}***'.format(string))
Naive Bayes is one of the simplest supervised machine learning algorithms, yet it is very efficient: it learns fast and makes quick predictions, which is why it is so useful and popular. The name contains two words, Naive and Bayes: Bayes because it is built on Bayes' theorem, and Naive because it assumes that the features are independent even when they are actually interdependent. It is simple but very powerful, works well with large datasets and sparse matrices, and performs particularly well on text classification problems such as spam filtering.
Bayes' theorem describes the probability of an event based on prior knowledge of conditions that might be related to that event. First, let's take the formula of conditional probability and try to derive Bayes' theorem.
The probability of event A given B, i.e. the probability of A when event B has already taken place, is equal to the probability of the intersection of A and B (the probability that both A and B take place) divided by the probability of B: $$p(A|B) = \frac{p(A\cap B)}{p(B)}$$
We have the same for the probability of event B given event A: $$p(B|A) = \frac{p(A\cap B)}{p(A)}$$ Note that $p(A\cap B)$ and $p(B\cap A)$ are the same. Since they are equal, we can take both formulas, move the denominators to the left-hand side, and equate them: $$ p(B|A)p(A) = p(A\cap B) = p(B \cap A) = p(A|B)p(B) $$
So, when we want to find the probability of A given B, we can write the equation this way:
$$P(A|B) = \frac{P(B|A) \cdot P(A)}{P(B)}$$
and this is the equation of Bayes' theorem.
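As a quick sanity check with purely illustrative numbers: suppose $P(A)=0.3$, $P(B|A)=0.8$ and $P(B)=0.5$; then $$P(A|B) = \frac{0.8 \cdot 0.3}{0.5} = 0.48$$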
For our purposes we are going to use the Golf Play dataset.
df = pd.read_csv("../../../resources/data/golf_df.csv")
df
| | Outlook | Temperature | Humidity | Windy | Play |
|---|---|---|---|---|---|
0 | sunny | hot | high | False | no |
1 | sunny | hot | high | True | no |
2 | overcast | hot | high | False | yes |
3 | rainy | mild | high | False | yes |
4 | rainy | cool | normal | False | yes |
5 | rainy | cool | normal | True | no |
6 | overcast | cool | normal | True | yes |
7 | sunny | mild | high | False | no |
8 | sunny | cool | normal | False | yes |
9 | rainy | mild | normal | False | yes |
10 | sunny | mild | normal | True | yes |
11 | overcast | mild | high | True | yes |
12 | overcast | hot | normal | False | yes |
13 | rainy | mild | high | True | no |
We classify whether a day is suitable for playing golf, given the features of the day. The columns represent these features and the rows represent individual entries. If we take the first row of the dataset, we can observe that the day is not suitable for playing golf when the outlook is sunny, the temperature is hot, the humidity is high and it is not windy. We make two assumptions here. First, as stated above, we consider these predictors to be independent: if the temperature is hot, it does not necessarily mean that the humidity is high. Second, all the predictors have an equal effect on the outcome: the day being windy does not carry more weight than the other features in deciding whether to play golf or not.
According to this example, Bayes theorem can be rewritten as:
$$P(y|X) = \frac{P(X|y) \cdot P(y)}{P(X)}$$
The variable y is the class variable (Play), which represents whether it is suitable to play golf or not given the conditions. The variable X represents the parameters/features.
X is given as
$$X = (x_1,x_2,...,x_n)$$
Here $x_1, x_2, \dots, x_n$ represent the features, i.e. they can be mapped to outlook, temperature, humidity and windy. By substituting for X and expanding using the chain rule we get
$$P(y|x_1,\dots,x_n) = \frac{P(x_1|y)\,P(x_2|y,x_1)\cdots P(x_n|y,x_1,\dots,x_{n-1})\,P(y)}{P(x_1,x_2,\dots,x_n)}$$
Because we assume that the features $x_i$ are independent, we can write the Bayes formula for all features as follows:
$$P(y|x_1,\dots,x_n) = \frac{P(y)\prod_{i=1}^{n}P(x_i|y)}{P(x_1)\,P(x_2)\cdots P(x_n)}$$
In our dataset the variables are discrete!
Now, you can obtain the values for each term by looking at the dataset and substituting them into the equation. For all entries in the dataset the denominator does not change; it remains constant. Therefore, the denominator can be removed for our purposes: $$P(y| x_1,x_2,\dots,x_n ) \propto P(y)\prod_{i=1}^{n}P(x_i|y)$$
For example, using the 'Outlook' feature with the value 'overcast':
$P(Play="yes"|Outlook="overcast") \propto P(Outlook="overcast"|\;Play="yes" )P(Play="yes")$
Let's produce the likelihood table.
label = "Play"
yes = df[df[label] == "yes"].groupby("Outlook")[label].count()
no = df[df[label] == "no"].groupby("Outlook")[label].count()
likelihood_yes = yes/yes.sum()
likelihood_no = no/no.sum()
likelihood_yes.index = [f'P ( Outlook= "{i}"| Play="yes" ) = ' for i in likelihood_yes.index]
likelihood_yes
P ( Outlook= "overcast"| Play="yes" ) = 0.444444 P ( Outlook= "rainy"| Play="yes" ) = 0.333333 P ( Outlook= "sunny"| Play="yes" ) = 0.222222 Name: Play, dtype: float64
likelihood_no.index = [f'P ( Outlook= "{i}"| Play="no" ) = ' for i in likelihood_no.index]
likelihood_no
P ( Outlook= "rainy"| Play="no" ) = 0.4 P ( Outlook= "sunny"| Play="no" ) = 0.6 Name: Play, dtype: float64
From both output tables we can see at a glance that, for example, $P( Outlook= "rainy"| Play="yes" ) = 0.333333$ while $P ( Outlook= "rainy"| Play="no" ) = 0.4$, which means the likelihood of Outlook="rainy" given $Play="no"$ is larger than given $Play="yes"$.
Let's create the likelihood table for all feature values.
def get_feature_count(counts, feature_value):
    # return the count for a feature value, or 0 if that value never
    # occurs for the given class (e.g. Outlook="overcast" with Play="no")
    try:
        return counts[feature_value]
    except KeyError:
        return 0

def create_likelihood_tb(df, label):
    likelihood_table = {}
    features = df.drop(label, axis=1).columns
    for feature in features:
        yes = df[df[label] == "yes"].groupby(feature)[label].count()
        no = df[df[label] == "no"].groupby(feature)[label].count()
        totals = df.groupby(feature)[label].count()
        for feature_value in totals.index:
            c = totals[feature_value]
            c1 = get_feature_count(yes, feature_value)
            c2 = get_feature_count(no, feature_value)
            likelihood_table[feature_value] = {
                'yes': c1 / yes.sum(),   # P(x = feature_value | Play = "yes")
                'no': c2 / no.sum(),     # P(x = feature_value | Play = "no")
                'P': c / totals.sum(),   # P(x = feature_value)
            }
    return likelihood_table
likelihood_df = create_likelihood_tb(df, "Play")
likelihood_df = pd.DataFrame(likelihood_df)
likelihood_df
| | overcast | rainy | sunny | cool | hot | mild | high | normal | False | True |
|---|---|---|---|---|---|---|---|---|---|---|
yes | 0.444444 | 0.333333 | 0.222222 | 0.333333 | 0.222222 | 0.444444 | 0.333333 | 0.666667 | 0.666667 | 0.333333 |
no | 0.000000 | 0.400000 | 0.600000 | 0.200000 | 0.400000 | 0.400000 | 0.800000 | 0.200000 | 0.400000 | 0.600000 |
P | 0.285714 | 0.357143 | 0.357143 | 0.285714 | 0.285714 | 0.428571 | 0.500000 | 0.500000 | 0.571429 | 0.428571 |
The table above shows the likelihood of every feature value x.
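A single entry can be read off directly; for example (a small usage sketch, assuming the likelihood_df DataFrame built above):
# P(Outlook="sunny" | Play="no")
likelihood_df['sunny']['no']   # 0.6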
To apply Bayes' theorem we also have to include the prior probabilities $P(Play=yes)$ and $P(Play=no)$.
c = df.groupby('Play').count().iloc[:, 0]
prior_probability = c /c.sum()
prior_probability
Play
no     0.357143
yes    0.642857
Name: Outlook, dtype: float64
Let's make the prediction using the likelihood table and the prior probabilities.
def calculate_bayes(x, likelihood_tb, prior_probability):
    # start from the priors and multiply in the likelihood of every feature value
    yes = prior_probability['yes']
    no = prior_probability['no']
    for index in x.index:
        value = x[index]
        yes = yes * likelihood_tb[value]['yes']
        no = no * likelihood_tb[value]['no']
    # predict the class with the larger (unnormalized) posterior
    return "yes" if yes > no else "no"
test = df.drop("Play", axis=1)
predict = test.apply(calculate_bayes, likelihood_tb=likelihood_df, prior_probability=prior_probability, axis=1)
predict
0      no
1      no
2     yes
3     yes
4     yes
5     yes
6     yes
7      no
8     yes
9     yes
10    yes
11    yes
12    yes
13     no
dtype: object
Let's check the accuracy.
pr(accuracy_score(df["Play"],predict))
0.9285714285714286
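scikit-learn provides CategoricalNB for exactly this discrete setting. A minimal sketch for comparison (assuming the same df; note that CategoricalNB applies Laplace smoothing by default, so its probabilities can differ slightly from our raw-count table):
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder

# encode the categorical features as integer codes, as CategoricalNB expects
X_enc = OrdinalEncoder().fit_transform(df.drop("Play", axis=1))
cnb = CategoricalNB()  # alpha=1 (Laplace smoothing) by default
cnb.fit(X_enc, df["Play"])
accuracy_score(df["Play"], cnb.predict(X_enc))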
When the predictors take continuous values rather than discrete ones, we assume that these values are sampled from a Gaussian distribution.
Just as in the discrete naive Bayes case above, we have to find the likelihood of the feature values.
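Concretely, for each class $y$ we estimate the mean $\mu_y$ and the standard deviation $\sigma_y$ of a continuous feature within that class, and model the class-conditional likelihood with the normal density $$P(x_i \mid y) = \frac{1}{\sqrt{2\pi\sigma_y^2}} \exp\left(-\frac{(x_i-\mu_y)^2}{2\sigma_y^2}\right)$$ which is what scipy.stats.norm.pdf is used for below.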
We will use the Titanic dataset.
train = pd.read_csv("../../../resources/data/titanic/train.csv")
train['Sex'] = (train['Sex'] == 'male').astype(int)  # encode Sex as 1 = male, 0 = female
train.head()
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
train.describe()
| | PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare |
|---|---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 0.647587 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 0.477990 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 0.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 1.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 1.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 1.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
Gaussian naive Bayes uses the same decision rule: $$P(y| x_1,x_2,\dots,x_n ) \propto P(y)\prod_{i=1}^{n}P(x_i|y)$$
$$P(survived=1 \mid X) \propto P(survived=1)\prod_{i=1}^{n}P(x_i|survived=1)$$ $$P(survived=0 \mid X) \propto P(survived=0)\prod_{i=1}^{n}P(x_i|survived=0)$$
def get_stats(data, label):
    # per-class summary statistics (count, mean, std, ...) for every numeric feature
    result = {}
    for i in data[label].unique():
        result[i] = data[data[label] == i].describe()
    return result
label = 'Survived'
stats = get_stats(train, label = label)
Let us define a small plotting helper and calculate the prior probability of the label Survived.
def plot_probability(x, stats, feature, prior_probability=None):
    # plot the Gaussian likelihood P(x|label) for every label value;
    # if priors are given, scale each curve by its prior to get the
    # (unnormalized) posterior
    for label_value in stats:
        mu = stats[label_value][feature]['mean']
        sigma = stats[label_value][feature]['std']
        y = norm.pdf(x, mu, sigma)
        if prior_probability is not None:
            y = y * prior_probability[label_value]
            plt.plot(x, y, label=f'P(x|{label}={label_value}) P({label}={label_value})')
            plt.title(f'posterior probability of {feature}')
        else:
            plt.plot(x, y, label=f'P(x|{label}={label_value})')
            plt.title(f'likelihood of {feature}')
prior_prob = train.groupby(by=label,axis=0)[label].count()/len(train[label])
prior_prob
Survived
0    0.616162
1    0.383838
Name: Survived, dtype: float64
First, let's plot the likelihood function (the Gaussian density) $P(Pclass|survived)$ for the feature 'Pclass'.
x = np.linspace(0.5,3.5)
plot_probability(x, stats,feature='Pclass' )
plt.legend()
[Figure: likelihood of Pclass for Survived=0 and Survived=1]
We can see that for Pclass=3 the likelihood clearly favors 'not survived', for Pclass=1 it favors 'survived', and around Pclass=2 the two curves are close. Let's include the prior knowledge $p(survived=1)$ and $p(survived=0)$.
x = np.linspace(0.5,3.5)
plot_probability(x, stats,feature='Pclass',prior_probability=prior_prob )
plt.legend()
[Figure: posterior probability of Pclass, i.e. the likelihood curves scaled by the priors]
Including the prior probability $p(survived=y)$, we can notice that at $Pclass=2$ the curve for 'not survived' is now clearly above the one for 'survived', whereas with the likelihood alone they were almost equal. In this way, we update our beliefs.
Let's see the same for the feature 'Sex'.
x = np.linspace(-1,3)
plot_probability(x, stats, feature='Sex')
plt.legend()
[Figure: likelihood of Sex for Survived=0 and Survived=1]
x = np.linspace(-1,3)
plot_probability(x, stats, feature='Sex', prior_probability=prior_prob )
plt.legend()
[Figure: posterior probability of Sex, i.e. the likelihood curves scaled by the priors]
Implementation of the likelihood table:
stats = get_stats(train, label = label)
def likelihood_table(x, label_stats):
    # x is one feature column; evaluate the Gaussian likelihood of every entry
    # under each class, using that class's mean and std for this feature
    result = {}
    for label_value in label_stats:
        stats_feature = label_stats[label_value][x.name]
        sigma = stats_feature['std']
        mu = stats_feature['mean']
        result[label_value] = norm.pdf(x, mu, sigma)
    return result
result = train[['Pclass','Sex']].apply(likelihood_table, label_stats=stats)
Let us make the prediction.
# unnormalized posteriors: prior * likelihood(Pclass) * likelihood(Sex)
not_survived = result['Pclass'][0] * result['Sex'][0] * prior_prob[0]
survived = result['Pclass'][1] * result['Sex'][1] * prior_prob[1]
predict = (survived > not_survived).astype(int)
accuracy_score(predict, train[label])
0.7867564534231201
We achieve an accuracy of about 0.79 using only the features Pclass and Sex. Let's check the accuracy of the sklearn implementation.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train[['Pclass','Sex']],train['Survived'])
predict = gnb.predict(train[['Pclass','Sex']])
accuracy_score(predict, train[label])
0.7867564534231201
The result is the same!
BernoulliNB implements the naive Bayes training and classification algorithms for data that is distributed according to multivariate Bernoulli distributions; i.e., there may be multiple features but each one is assumed to be a binary-valued (Bernoulli, boolean) variable. Therefore, this class requires samples to be represented as binary-valued feature vectors; if handed any other kind of data, a BernoulliNB instance may binarize its input (depending on the binarize parameter).
The decision rule for Bernoulli naive Bayes is based on $$ P(x_i| y) = P(i|y)x_i + (1- P(i|y))(1-x_i)$$
which differs from multinomial NB's rule in that it explicitly penalizes the non-occurrence of a feature $i$ that is an indicator for class $y$, where the multinomial variant would simply ignore a non-occurring feature.
In the case of text classification, word occurrence vectors (rather than word count vectors) may be used to train and use this classifier. BernoulliNB might perform better on some datasets, especially those with shorter documents. It is advisable to evaluate both models, if time permits.
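As a minimal sketch of how BernoulliNB could be tried on the Titanic data used above (an illustration with hand-made binary features, not part of the original analysis; the ThirdClass indicator is an arbitrary choice, so the accuracy will generally differ from the Gaussian model):
from sklearn.naive_bayes import BernoulliNB

# build explicitly binary features: Sex is already 0/1,
# and we turn Pclass into an indicator for "third class"
X_bin = pd.DataFrame({
    'Sex': train['Sex'],
    'ThirdClass': (train['Pclass'] == 3).astype(int),
})
bnb = BernoulliNB(binarize=None)  # features are already binary
bnb.fit(X_bin, train['Survived'])
accuracy_score(train['Survived'], bnb.predict(X_bin))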