# HW3-3 Probability - Naguib

## 1904 days ago by bigdata2016

A naive Bayesian model for binary classification using a single feature

# --- imports and model helpers -------------------------------------------
import matplotlib.pyplot as plt
import numpy as np
from sage.symbolic.integration.integral import definite_integral

samples = 500

def GaussPDF(x, mu, sigma):
    """Gaussian probability density at x for N(mu, sigma^2).

    Only sigma**2 appears, so the sign of sigma is irrelevant here.
    """
    return (1/sqrt(2*pi*sigma**2))*exp(-(x-mu)**2/(2*sigma**2))

def Intersect2Gassians(mu1, sigma1, mu2, sigma2):
    """Return the two x where the N(mu1,sigma1^2) and N(mu2,sigma2^2) PDFs
    are equal.

    Equating the log-densities yields a quadratic a*x^2 + b*x + c = 0,
    solved in closed form.  Assumes sigma1^2 != sigma2^2 (otherwise a == 0
    and the division by 2*a fails).
    """
    # gamma = 2*s1^2*s2^2*ln(s1/s2), written via the normalization constants
    gamma = 2*(sigma1**2)*(sigma2**2)*ln((1/sqrt(2*pi*sigma2**2))/(1/sqrt(2*pi*sigma1**2)))
    a = (sigma1**2)-(sigma2**2)
    b = 2*((sigma2**2)*mu1-(sigma1**2)*mu2)
    c = (sigma1**2)*(mu2**2)-(sigma2**2)*(mu1**2)-gamma
    x1 = (-b+sqrt((b**2)-4*a*c))/(2*a)
    x2 = (-b-sqrt((b**2)-4*a*c))/(2*a)
    return [x1, x2]

def errorFunction(x):
    """Rational approximation of erf(x) (Abramowitz & Stegun eq. 7.1.26,
    max absolute error ~1.5e-7).

    erf is odd, so the polynomial is evaluated on |x| and the sign is
    restored afterwards (exp(-x*x) is even, so it needs no adjustment).
    """
    t = 1/(1+0.3275911*abs(1.0*x))
    out = (1-exp(-x*x)*(0.254829592*t-0.284496736*t*t+1.421413741*t*t*t
                        -1.453152027*t*t*t*t+1.061405429*t*t*t*t*t))
    if x <= 0:
        out = out*-1
    return out

def IntegrateGaussian(mu, sigma, from_, to_):
    """Integrate the N(mu, sigma^2) PDF over [from_, to_] via the erf CDF.

    BUGFIX: take abs(sigma).  find_fit can legitimately return a negative
    sigma (the fitted model depends only on sigma**2), and a negative sigma
    here flipped the sign of the erf argument, producing negative
    "probabilities" in the error computation downstream.
    """
    sigma = abs(sigma)
    I1 = 0.5*(1+errorFunction((from_-mu)/(sqrt(2.0)*sigma)))
    I2 = 0.5*(1+errorFunction((to_ -mu)/(sqrt(2.0)*sigma)))
    return I2-I1

# --- draw training samples for the two classes ---------------------------
A = matrix(RDF, samples, 1)
B = matrix(RDF, samples, 1)
TA = RealDistribution('gaussian', 2)    # class A: N(0, 2^2), shifted to +5 below
TB = RealDistribution('gaussian', 1.4)  # class B: N(0, 1.4^2), shifted to -5 below
pretty_print("Let's say we measured the feature of class A and class B", " ", samples, " times.")
for i in range(samples):
    A[i] = TA.get_random_element()+5
    B[i] = TB.get_random_element()-5
pA = plt.hist(A.transpose(), bins=50, range=(-15.,15.), color='r', alpha=0.3)
pB = plt.hist(B.transpose(), bins=50, range=(-15.,15.), color='b', alpha=0.3)
plt.legend(('A','B'))
plt.title('histogram of samples')
plt.savefig('Histogram.png')
plt.close()
 $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|Let's|\phantom{\verb!x!}\verb|say|\phantom{\verb!x!}\verb|we|\phantom{\verb!x!}\verb|measured|\phantom{\verb!x!}\verb|the|\phantom{\verb!x!}\verb|feature|\phantom{\verb!x!}\verb|of|\phantom{\verb!x!}\verb|class|\phantom{\verb!x!}\verb|A|\phantom{\verb!x!}\verb|and|\phantom{\verb!x!}\verb|class|\phantom{\verb!x!}\verb|B| \phantom{\verb!x!} 500 \phantom{\verb!x!}\verb|times.|$
# Least-squares fit of a scaled Gaussian to each class's empirical histogram.
# A coarse histogram (int(samples/10)+1 = 51 bins over each class's own range)
# is built by hand; each raw sample is then paired with the count of its bin
# to form the (x, y) regression data that find_fit consumes.
minA=min(A)
ranA=max(A)-min(A)
minB=min(B)
ranB=max(B)-min(B)
hist_A=matrix(RDF,int(samples/10)+1,1)
hist_B=matrix(RDF,int(samples/10)+1,1)
for i in range(int(samples/10)+1):
    hist_A[i]=0
    hist_B[i]=0
for i in range(samples):
    # bin index in [0, samples/10]; the maximum sample lands in the last bin
    idxA=int((A[i]-minA)*int(samples/10)/ranA)
    idxB=int((B[i]-minB)*int(samples/10)/ranB)
    hist_A[idxA]=hist_A[idxA,0]+1
    hist_B[idxB]=hist_B[idxB,0]+1
# (sample value, bin count) pairs fed to the symbolic fit below
A_2D=matrix(RDF,samples,2)
B_2D=matrix(RDF,samples,2)
for i in range(samples):
    idxA=int((A[i]-minA)*int(samples/10)/ranA)
    idxB=int((B[i]-minB)*int(samples/10)/ranB)
    A_2D[i,0]=A[i,0]
    A_2D[i,1]=hist_A[idxA,0]
    B_2D[i,0]=B[i,0]
    B_2D[i,1]=hist_B[idxB,0]
# Symbolic model: an alpha-scaled Gaussian pdf (Sage callable expression).
var('sigma mu alpha x')
model(x) = alpha * (1/sqrt(2*pi*sigma**2))*exp(-(x-mu)**2/(2*sigma**2))
show("Let's model their likelihood distribution with Gaussian pdf")
# NOTE(review): find_fit may return a NEGATIVE sigma (the model depends only
# on sigma**2; see the fitted sigma_A below) — consumers of A_model[sigma] /
# B_model[sigma] that are sign-sensitive should take abs().
A_model = find_fit(A_2D, model, initial_guess=[1,0,1], solution_dict = True)
pretty_print("Estimated Gaussian pdf of A:", " ", mu, " =", " ", A_model[mu], ",", " ", sigma, " =", " ", A_model[sigma])
B_model = find_fit(B_2D, model, initial_guess=[1,0,1], solution_dict = True)
pretty_print("Estimated Gaussian pdf of B:", " ", mu, " =", " ", B_model[mu], ",", " ", sigma, " =", " ", B_model[sigma])
gaussx=np.arange(-15, 15, 0.1);
# *2: empirical amplitude rescale so the fitted curve overlays the 50-bin
# histogram — presumably compensates for the differing bin widths; TODO confirm
gaussAy=A_model[alpha]*2 * GaussPDF(gaussx, A_model[mu],A_model[sigma])
gaussBy=B_model[alpha]*2 * GaussPDF(gaussx, B_model[mu],B_model[sigma])
pA=plt.hist(A.transpose(), bins=50, range=(-15.,15.), color='r', alpha=0.1)
pB=plt.hist(B.transpose(), bins=50, range=(-15.,15.), color='b', alpha=0.1)
plt.plot(gaussx,gaussAy, color='r')
plt.plot(gaussx,gaussBy, color='b')
plt.legend(('A','B'))
plt.title('Likelihood distribution by Gaussian PDF fitting')
plt.savefig('fitting.png')
plt.close()
 \newcommand{\Bold}[1]{\mathbf{#1}}\verb|Let's|\phantom{\verb!x!}\verb|model|\phantom{\verb!x!}\verb|their|\phantom{\verb!x!}\verb|likelihood|\phantom{\verb!x!}\verb|distribution|\phantom{\verb!x!}\verb|with|\phantom{\verb!x!}\verb|Gaussian|\phantom{\verb!x!}\verb|pdf| $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|Estimated|\phantom{\verb!x!}\verb|Gaussian|\phantom{\verb!x!}\verb|pdf|\phantom{\verb!x!}\verb|of|\phantom{\verb!x!}\verb|A:| \phantom{\verb!x!} \mu \phantom{\verb!x!}\verb|=| \phantom{\verb!x!} 5.0248229255 \verb|,| \phantom{\verb!x!} \sigma \phantom{\verb!x!}\verb|=| \phantom{\verb!x!} -2.06057480058$ $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|Estimated|\phantom{\verb!x!}\verb|Gaussian|\phantom{\verb!x!}\verb|pdf|\phantom{\verb!x!}\verb|of|\phantom{\verb!x!}\verb|B:| \phantom{\verb!x!} \mu \phantom{\verb!x!}\verb|=| \phantom{\verb!x!} -4.90008160372 \verb|,| \phantom{\verb!x!} \sigma \phantom{\verb!x!}\verb|=| \phantom{\verb!x!} 1.26629672512$
# Bayes-rule posteriors for a single measurement x under equal priors
# P(A) = P(B); the prior terms cancel, leaving a likelihood ratio.  The
# posterior curves are plotted over the normalized sample histograms.
pretty_print("Now, let's assume we have a feature measurement", " ", x, " and we want to know which class (A or B) it came from")
function('P')  #var('A B')
show("From Bayes Theory, we can derive:")
show(LatexExpr(r"P(A|x)=\frac{P(x|A)P(A)}{P(x)}"))
show(LatexExpr(r"=\frac{P(x|A)P(A)}{P(x|A)P(A)+P(x|B)P(B)}"))
show(LatexExpr(r"=\frac{1}{1+\frac{P(x|B)P(B)}{P(x|A)P(A)}}"))
pretty_print("Let's assume we don't have prior information of which class is more likely to appear, i.e. they are equiprobable", " ", LatexExpr(r"P(A)=P(B)"))
show(LatexExpr(r"P(A|x)=\frac{1}{1+\frac{P(x|B)}{P(x|A)}}"))
pretty_print("and similarly:")
show(LatexExpr(r"P(B|x)=\frac{1}{1+\frac{P(x|A)}{P(x|B)}}"))
pretty_print("this will generate the following decision characteristics")
# posterior P(A|x) and P(B|x) evaluated on the plotting grid gaussx
classAy=1/(1+ GaussPDF(gaussx,B_model[mu],B_model[sigma])/GaussPDF(gaussx,A_model[mu],A_model[sigma]))
classBy=1/(1+ GaussPDF(gaussx,A_model[mu],A_model[sigma])/GaussPDF(gaussx,B_model[mu],B_model[sigma]))
# NOTE(review): `normed=1` was deprecated and later removed from matplotlib
# (use `density=True` when upgrading); kept as-is for the matplotlib version
# this worksheet originally ran under.
pA=plt.hist(A.transpose(), bins=50, color='r', alpha=0.1, normed=1, range=(-15.,15.))
pB=plt.hist(B.transpose(), bins=50, color='b', alpha=0.1, normed=1, range=(-15.,15.))
plt.plot(gaussx,classAy, color='r')
plt.plot(gaussx,classBy, color='b')
plt.legend(('A','B'))
plt.title('Classification characteristics')
plt.savefig('classification.png')
plt.close()
 $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|Now,|\phantom{\verb!x!}\verb|let's|\phantom{\verb!x!}\verb|assume|\phantom{\verb!x!}\verb|we|\phantom{\verb!x!}\verb|have|\phantom{\verb!x!}\verb|a|\phantom{\verb!x!}\verb|feature|\phantom{\verb!x!}\verb|measurement| \phantom{\verb!x!} x \phantom{\verb!x!}\verb|and|\phantom{\verb!x!}\verb|we|\phantom{\verb!x!}\verb|want|\phantom{\verb!x!}\verb|to|\phantom{\verb!x!}\verb|know|\phantom{\verb!x!}\verb|which|\phantom{\verb!x!}\verb|class|\phantom{\verb!x!}\verb|(A|\phantom{\verb!x!}\verb|or|\phantom{\verb!x!}\verb|B)|\phantom{\verb!x!}\verb|it|\phantom{\verb!x!}\verb|came|\phantom{\verb!x!}\verb|from|$ \newcommand{\Bold}[1]{\mathbf{#1}}\verb|From|\phantom{\verb!x!}\verb|Bayes|\phantom{\verb!x!}\verb|Theory,|\phantom{\verb!x!}\verb|we|\phantom{\verb!x!}\verb|can|\phantom{\verb!x!}\verb|derive:| \newcommand{\Bold}[1]{\mathbf{#1}}P(A|x)=\frac{P(x|A)P(A)}{P(x)} \newcommand{\Bold}[1]{\mathbf{#1}}=\frac{P(x|A)P(A)}{P(x|A)P(A)+P(x|B)P(B)} \newcommand{\Bold}[1]{\mathbf{#1}}=\frac{1}{1+\frac{P(x|B)P(B)}{P(x|A)P(A)}} $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|Let's|\phantom{\verb!x!}\verb|assume|\phantom{\verb!x!}\verb|we|\phantom{\verb!x!}\verb|don't|\phantom{\verb!x!}\verb|have|\phantom{\verb!x!}\verb|prior|\phantom{\verb!x!}\verb|information|\phantom{\verb!x!}\verb|of|\phantom{\verb!x!}\verb|which|\phantom{\verb!x!}\verb|class|\phantom{\verb!x!}\verb|is|\phantom{\verb!x!}\verb|more|\phantom{\verb!x!}\verb|likely|\phantom{\verb!x!}\verb|to|\phantom{\verb!x!}\verb|appear,|\phantom{\verb!x!}\verb|i.e.|\phantom{\verb!x!}\verb|they|\phantom{\verb!x!}\verb|are|\phantom{\verb!x!}\verb|equiprobable| \phantom{\verb!x!} P(A)=P(B)$ \newcommand{\Bold}[1]{\mathbf{#1}}P(A|x)=\frac{1}{1+\frac{P(x|B)}{P(x|A)}} $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|and|\phantom{\verb!x!}\verb|similarly:|$ \newcommand{\Bold}[1]{\mathbf{#1}}P(B|x)=\frac{1}{1+\frac{P(x|A)}{P(x|B)}} 
$\newcommand{\Bold}[1]{\mathbf{#1}}\verb|this|\phantom{\verb!x!}\verb|will|\phantom{\verb!x!}\verb|generate|\phantom{\verb!x!}\verb|the|\phantom{\verb!x!}\verb|following|\phantom{\verb!x!}\verb|decision|\phantom{\verb!x!}\verb|characteristics|$
# Decision boundary C and the two misclassification probabilities, computed
# as tail integrals of the fitted class likelihoods beyond C.
pretty_print("we can compute the classification error as follows")
pretty_print("Classification boundary (C)")
show(LatexExpr(r"P(A|C)=P(B|C)"))
show(LatexExpr(r"\frac{1}{1+\frac{P(C|A)}{P(C|B)}}=\frac{1}{1+\frac{P(C|B)}{P(C|A)}}"))
show(LatexExpr(r"\frac{P(C|A)}{P(C|B)}=\frac{P(C|B)}{P(C|A)}"))
show(LatexExpr(r"P(C|A)=P(C|A)"))
show(LatexExpr(r"\frac{1}{\sqrt{2\pi\sigma_A^2}}e^{-\frac{(C-\mu_A)^2}{2\sigma_A^2}} = \frac{1}{\sqrt{2\pi\sigma_B^2}}e^{-\frac{(C-\mu_B)^2}{2\sigma_B^2}}"))
# The quadratic has two roots; keep the one of smaller magnitude, i.e. the
# crossing that lies between the two class means.
C = Intersect2Gassians(A_model[mu], A_model[sigma], B_model[mu], B_model[sigma])
if abs(C[1]) < abs(C[0]):
    C[0] = C[1]
C = C[0]
pretty_print("C =", " ", C)
show(" ")
# BUGFIX: pass |sigma| — find_fit can return a negative sigma (the fitted
# model depends only on sigma**2), which previously flipped the sign of the
# erf argument and made P(B|A) print as a negative percentage.
# 99999 stands in for +/- infinity; the Gaussian tail beyond it is negligible.
ErrA = IntegrateGaussian(B_model[mu], abs(B_model[sigma]), C, 99999)
ErrB = IntegrateGaussian(A_model[mu], abs(A_model[sigma]), -99999, C)
show(LatexExpr(r"P(A|B)=\int_{C}^{\infty} P(x|B) dx"))
show(LatexExpr(r"P(A|B)=\int_{C}^{\infty} \frac{1}{\sqrt{2\pi\sigma_B^2}}e^{-\frac{(x-\mu_B)^2}{2\sigma_B^2}} dx"))
pretty_print(LatexExpr(r"P(A|B)"), " =", " ", ErrA*100, "%")
show(" ")
pretty_print(LatexExpr(r"similarly, P(B|A)"), " =", " ", ErrB*100, "%")
show(" ")
 $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|we|\phantom{\verb!x!}\verb|can|\phantom{\verb!x!}\verb|compute|\phantom{\verb!x!}\verb|the|\phantom{\verb!x!}\verb|classification|\phantom{\verb!x!}\verb|error|\phantom{\verb!x!}\verb|as|\phantom{\verb!x!}\verb|follows|$ $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|Classification|\phantom{\verb!x!}\verb|boundary|\phantom{\verb!x!}\verb|(C)|$ \newcommand{\Bold}[1]{\mathbf{#1}}P(A|C)=P(B|C) \newcommand{\Bold}[1]{\mathbf{#1}}\frac{1}{1+\frac{P(C|A)}{P(C|B)}}=\frac{1}{1+\frac{P(C|B)}{P(C|A)}} \newcommand{\Bold}[1]{\mathbf{#1}}\frac{P(C|A)}{P(C|B)}=\frac{P(C|B)}{P(C|A)} \newcommand{\Bold}[1]{\mathbf{#1}}P(C|A)=P(C|A) \newcommand{\Bold}[1]{\mathbf{#1}}\frac{1}{\sqrt{2\pi\sigma_A^2}}e^{-\frac{(C-\mu_A)^2}{2\sigma_A^2}} = \frac{1}{\sqrt{2\pi\sigma_B^2}}e^{-\frac{(C-\mu_B)^2}{2\sigma_B^2}} $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|C|\phantom{\verb!x!}\verb|=| \phantom{\verb!x!} -0.995216399987$ \newcommand{\Bold}[1]{\mathbf{#1}}\phantom{\verb!x!} \newcommand{\Bold}[1]{\mathbf{#1}}P(A|B)=\int_{C}^{\infty} P(x|B) dx \newcommand{\Bold}[1]{\mathbf{#1}}P(A|B)=\int_{C}^{\infty} \frac{1}{\sqrt{2\pi\sigma_B^2}}e^{-\frac{(x-\mu_B)^2}{2\sigma_B^2}} dx $\newcommand{\Bold}[1]{\mathbf{#1}}P(A|B) \phantom{\verb!x!}\verb|=| \phantom{\verb!x!} 0.102232467393359 \verb|%|$ \newcommand{\Bold}[1]{\mathbf{#1}}\phantom{\verb!x!} $\newcommand{\Bold}[1]{\mathbf{#1}}similarly, P(B|A) \phantom{\verb!x!}\verb|=| \phantom{\verb!x!} -0.174162907138120 \verb|%|$ \newcommand{\Bold}[1]{\mathbf{#1}}\phantom{\verb!x!}
# Laplace smoothing: add a small epsilon to both likelihoods so that
# far-tail measurements (|x| > 10, where no training samples exist) no
# longer get forced into a fully confident A-or-B decision.
pretty_print("we can notice that there's a small problem with classification of features with magnitude > 10.")
pretty_print("it will be classified either as A or B depending on its side, in spite of not actually having any")
pretty_print("samples in the training to support such decision. to fix this, we can add laplace smoothing, that is,")
pretty_print("to assume that every possible outcome of an experiment happened at least once, before we actually start training")
show(LatexExpr(r"P'(x|A)=P(x|A)+\epsilon"))
show(LatexExpr(r"P'(x|B)=P(x|B)+\epsilon"))
pretty_print("So, classification characteristics changes to be:")
show(LatexExpr(r"P(A|x)=\frac{1}{1+\frac{P(x|B)+\epsilon}{P(x|A)+\epsilon}}"))
show(LatexExpr(r"P(B|x)=\frac{1}{1+\frac{P(x|A)+\epsilon}{P(x|B)+\epsilon}}"))
# smoothed likelihoods on the plotting grid, then the smoothed posteriors
eps = 0.001
likA = eps + GaussPDF(gaussx, A_model[mu], A_model[sigma])
likB = eps + GaussPDF(gaussx, B_model[mu], B_model[sigma])
LclassAy = 1/(1 + likB/likA)
LclassBy = 1/(1 + likA/likB)
pA = plt.hist(A.transpose(), bins=50, color='r', alpha=0.1, normed=1, range=(-15.,15.))
pB = plt.hist(B.transpose(), bins=50, color='b', alpha=0.1, normed=1, range=(-15.,15.))
plt.plot(gaussx, LclassAy, color='r')
plt.plot(gaussx, LclassBy, color='b')
plt.legend(('A','B'))
plt.title('Classification characteristics with e = 0.001')
plt.savefig('laplace classification.png')
plt.close()
 $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|we|\phantom{\verb!x!}\verb|can|\phantom{\verb!x!}\verb|notice|\phantom{\verb!x!}\verb|that|\phantom{\verb!x!}\verb|there's|\phantom{\verb!x!}\verb|a|\phantom{\verb!x!}\verb|small|\phantom{\verb!x!}\verb|problem|\phantom{\verb!x!}\verb|with|\phantom{\verb!x!}\verb|classification|\phantom{\verb!x!}\verb|of|\phantom{\verb!x!}\verb|features|\phantom{\verb!x!}\verb|with|\phantom{\verb!x!}\verb|magnitude|\phantom{\verb!x!}\verb|>|\phantom{\verb!x!}\verb|10.|$ $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|it|\phantom{\verb!x!}\verb|will|\phantom{\verb!x!}\verb|be|\phantom{\verb!x!}\verb|classified|\phantom{\verb!x!}\verb|either|\phantom{\verb!x!}\verb|as|\phantom{\verb!x!}\verb|A|\phantom{\verb!x!}\verb|or|\phantom{\verb!x!}\verb|B|\phantom{\verb!x!}\verb|depending|\phantom{\verb!x!}\verb|on|\phantom{\verb!x!}\verb|its|\phantom{\verb!x!}\verb|side,|\phantom{\verb!x!}\verb|in|\phantom{\verb!x!}\verb|spite|\phantom{\verb!x!}\verb|of|\phantom{\verb!x!}\verb|not|\phantom{\verb!x!}\verb|actually|\phantom{\verb!x!}\verb|having|\phantom{\verb!x!}\verb|any|$ $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|samples|\phantom{\verb!x!}\verb|in|\phantom{\verb!x!}\verb|the|\phantom{\verb!x!}\verb|training|\phantom{\verb!x!}\verb|to|\phantom{\verb!x!}\verb|support|\phantom{\verb!x!}\verb|such|\phantom{\verb!x!}\verb|decision.|\phantom{\verb!x!}\verb|to|\phantom{\verb!x!}\verb|fix|\phantom{\verb!x!}\verb|this,|\phantom{\verb!x!}\verb|we|\phantom{\verb!x!}\verb|can|\phantom{\verb!x!}\verb|add|\phantom{\verb!x!}\verb|laplace|\phantom{\verb!x!}\verb|smoothing,|\phantom{\verb!x!}\verb|that|\phantom{\verb!x!}\verb|is,|$ 
$\newcommand{\Bold}[1]{\mathbf{#1}}\verb|to|\phantom{\verb!x!}\verb|assume|\phantom{\verb!x!}\verb|that|\phantom{\verb!x!}\verb|every|\phantom{\verb!x!}\verb|possible|\phantom{\verb!x!}\verb|outcome|\phantom{\verb!x!}\verb|of|\phantom{\verb!x!}\verb|an|\phantom{\verb!x!}\verb|experiment|\phantom{\verb!x!}\verb|happened|\phantom{\verb!x!}\verb|at|\phantom{\verb!x!}\verb|least|\phantom{\verb!x!}\verb|once,|\phantom{\verb!x!}\verb|before|\phantom{\verb!x!}\verb|we|\phantom{\verb!x!}\verb|actually|\phantom{\verb!x!}\verb|start|\phantom{\verb!x!}\verb|training|$ \newcommand{\Bold}[1]{\mathbf{#1}}P'(x|A)=P(x|A)+\epsilon \newcommand{\Bold}[1]{\mathbf{#1}}P'(x|B)=P(x|B)+\epsilon $\newcommand{\Bold}[1]{\mathbf{#1}}\verb|So,|\phantom{\verb!x!}\verb|classification|\phantom{\verb!x!}\verb|characteristics|\phantom{\verb!x!}\verb|changes|\phantom{\verb!x!}\verb|to|\phantom{\verb!x!}\verb|be:|$ \newcommand{\Bold}[1]{\mathbf{#1}}P(A|x)=\frac{1}{1+\frac{P(x|B)+\epsilon}{P(x|A)+\epsilon}} \newcommand{\Bold}[1]{\mathbf{#1}}P(B|x)=\frac{1}{1+\frac{P(x|A)+\epsilon}{P(x|B)+\epsilon}}