Coding Probability and Statistics With Python From Scratch
Coding Probability and Statistics With Python From Scratch
with Python
by Thom Ives, Ph.D.
What is the motivation for coding math from scratch, without libraries or
modules? My dear friend and brother Manjunatha Gummaraju says it best:
"Hand crafting (without libraries & automation) helps to get a firm grip on
the subject, nuances & its applications. It also helps probably to author new
innovative techniques from the ground up."
return mu
return sigma
$$p = \frac{1}{\sigma\sqrt{2\pi}}\, e^{-\frac{1}{2}\left(\frac{x-\mu}{\sigma}\right)^{2}}$$
return p
return p
# Midpoint-rule integration of the PDF between x_left and x_right: each pass
# adds the area of one panel of the given width to the running total CDF.
x = x_left + width / 2  # centre of the first panel
while x < x_right:
    X.append(x)  # abscissa, kept for plotting only
    panel = PDF(x, mean, std_dev) * width  # area of the panel centred at x
    CDF += panel  # accumulated panel areas approximate the integral
    CDF_y.append(CDF)  # cumulative value, kept for plotting only
    x += width  # move to the centre of the next panel
$$B(\alpha, \beta) = \int_{0}^{1} t^{\alpha-1}(1-t)^{\beta-1}\,dt$$
class Beta_Distribution:
    """Beta distribution whose normalising constant is found numerically.

    The Beta function B(alpha, beta) is approximated with a midpoint Riemann
    sum over [0, 1] and stored on the instance as ``self.B``.

    Args:
        alpha: First shape parameter (exponent ``alpha - 1`` on ``x``).
        beta: Second shape parameter (exponent ``beta - 1`` on ``1 - x``).
        panels: Number of equal-width integration panels (default 10000).
    """

    def __init__(self, alpha, beta, panels=10000):
        self.alpha = alpha
        self.beta = beta
        self.panels = panels
        self.__Beta_Function__()

    def __Beta_Function__(self):
        """Set ``self.B`` to the integral of x**(alpha-1) * (1-x)**(beta-1)."""
        width = 1 / self.panels
        # Sample every panel at its midpoint. Sampling at the left edge would
        # evaluate the integrand at x == 0, which raises ZeroDivisionError
        # whenever alpha < 1 (0.0 raised to a negative power).
        midpoints = [(i + 0.5) * width for i in range(self.panels)]
        # makes total integral of beta_PDF sum to 1
        self.B = sum(
            x**(self.alpha - 1) * (1 - x)**(self.beta - 1) * width
            for x in midpoints)

    def beta_PDF(self, x):
        """Return the Beta probability density at ``x``, normalised by ``self.B``."""
        return x**(self.alpha - 1) * \
            (1 - x)**(self.beta - 1) / self.B
Student's T-Distribution
$$B(\alpha, \beta) = \int_{0}^{1} t^{\alpha-1}(1-t)^{\beta-1}\,dt$$
$$PDF_t(t) = \frac{1}{\sqrt{\nu}\,B\left(\frac{1}{2}, \frac{\nu}{2}\right)}\left(1 + \frac{t^{2}}{\nu}\right)^{-\frac{\nu+1}{2}}$$
In [ ]: class T_Distribution:
def __init__(self, dof=9):
self.beta = self.beta_function(0.5, dof/2)
return beta
return f_of_t
return cdf
return p
mu = mean(X)
std = standard_deviation(X, mu=mu)
print(mu, std)
90.54 3.362796455332963
0.101
# Numerically integrate sin(t) and plot the function together with its
# running integral. Every list is indexed by the same time axis T so that the
# plotted samples line up with the times they were evaluated at.
w = 1/1000  # panel width (time step)
f_of_t = math.sin
T = [0]  # panel edges, starting at t = 0
S = [f_of_t(0)]  # function sampled at each time in T
C = [-1]  # running integral; starts at -cos(0) = -1 so C follows -cos(t)
for t in range(10000):
    t_right = (t + 1) * w  # right edge of the current panel
    T.append(t_right)  # Our time step
    S.append(f_of_t(t_right))  # Our Function
    # Integrating: midpoint rule over the panel that ends at t_right
    C.append(C[-1] + f_of_t(t_right - w/2) * w)
plt.plot(T, S)
plt.plot(T, C)
plt.show()
In [ ]: import matplotlib
from matplotlib import rc
import matplotlib.pyplot as plt
%matplotlib inline
# Render figure text with LaTeX and load amsmath for the math macros.
rc('text', usetex=True)
# The preamble must be a single string: current Matplotlib releases no longer
# accept a list here.
matplotlib.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'
!apt install texlive-fonts-recommended texlive-fonts-extra cm-super dvipng
# Panel midpoints covering [0, 22] with panel width pw.
pw = 1/1000
X = [(x + 0.5)*pw for x in range(22000)]

# PDF evaluated on the grid with arguments (x, 8, 2) and (x, 14, 2).
P1 = [PDF(x, 8, 2) for x in X]
P2 = [PDF(x, 14, 2) for x in X]

# Running sum of P1 panel areas: C1[i] is the accumulated area up to X[i].
C1 = []
sum1 = 0
for i, density in enumerate(P1):
    sum1 += density*pw
    C1.append(sum1)
for sl in SigLevels:
for i in range(len(X)):
if C1[i] > sl:
sig_i = i
break
plt.figure(figsize = (10,5))
plt.plot(X, P1)
plt.plot(X, P2)
plt.title(
label="Distributions For Null And Alternative Hypotheses", fontsize=
plt.xlabel(xlabel="Values", fontsize=14)
plt.ylabel(ylabel="Probability of Occurance", fontsize=14)
plt.savefig(f'hypo_{round(sl, 3)}.png')
plt.show()
time.sleep(1)
plt.figure().clear()
# Panel midpoints covering [0, 22] with panel width pw.
pw = 1/1000
X = [(x + 0.5)*pw for x in range(22000)]
P1 = [PDF(x, 8, 2) for x in X]

# Running sum of P1 panel areas: C1[i] is the accumulated area up to X[i].
C1 = []
sum1 = 0
for i, density in enumerate(P1):
    sum1 += density*pw
    C1.append(sum1)

# sig_i is the first grid index whose cumulative value exceeds sig_level.
sig_level = 0.975
for i, cumulative in enumerate(C1):
    if cumulative > sig_level:
        sig_i = i
        break
for i in range(19):
mu_alt = 12.4 + i * 0.2
mu_alt = round(mu_alt, 1)
plt.title(
label="Distributions For Null And Alternative Hypotheses", fontsize=
plt.xlabel(xlabel="Values", fontsize=14)
plt.ylabel(ylabel="Probability of Occurance", fontsize=14)
plt.xlim([0, 26])
plt.savefig(f'hypos_{round(mu_alt, 1)}.png')
plt.show()
time.sleep(1)
plt.figure().clear()
# Panel midpoints covering [0, 22] with panel width pw.
pw = 1/1000
X = [(x + 0.5)*pw for x in range(22000)]

# PDF evaluated on the grid with arguments (x, 8, 2) and (x, 11, 2).
P1 = [PDF(x, 8, 2) for x in X]
P2 = [PDF(x, 11, 2) for x in X]

# Running sums of panel areas give the cumulative curves C1 and C2.
C1, C2 = [], []
sum1 = sum2 = 0
for i, (density1, density2) in enumerate(zip(P1, P2)):
    sum1 += density1*pw
    sum2 += density2*pw
    C1.append(sum1)
    C2.append(sum2)

# Containers for the ROC points and the evenly spaced levels from 0 to 1.
ROC_X, ROC_Y = [], []
pts = 41
Sig_Levels = [(v * 100/(pts-1))/100 for v in range(pts)]
if sig_lev == 0.999:
sig_lev = 1
TP_Rate = 1 - C2[sig_i]
FP_Rate = 1 - C1[sig_i]
ROC_X.append(FP_Rate)
ROC_Y.append(TP_Rate)
ax1.set_xlim([0, 21])
fig.savefig(f'hypo_{round(sig_lev, 3)}.png')
time.sleep(0.2)
# Panel midpoints covering [0, 22] with panel width pw.
pw = 1/1000
X = [(x + 0.5)*pw for x in range(22000)]
P1 = [PDF(x, 8, 2) for x in X]

# Running sum of P1 panel areas: C1[i] is the accumulated area up to X[i].
C1 = []
sum1 = 0
for i, density in enumerate(P1):
    sum1 += density*pw
    C1.append(sum1)

# pts evenly spaced levels from 0 to 1 inclusive.
pts = 101
Sig_Levels = [(v * 100/(pts-1))/100 for v in range(pts)]
for i in range(71):
mu_alt = 8 + 0.1 * i
mu_alt = round(mu_alt, 1)
ROC_X = []
ROC_Y = []
for sig_lev in Sig_Levels:
if sig_lev == 1:
sig_lev = 0.999
for i in range(len(X)):
if C1[i] > sig_lev:
sig_i = i
break
if sig_lev == 0.999:
sig_lev = 1
TP_Rate = 1 - C2[sig_i]
FP_Rate = 1 - C1[sig_i]
ROC_X.append(FP_Rate)
ROC_Y.append(TP_Rate)
ax1.set_xlim([0, 21])
ax2.plot([0, 1], [0, 1])
ax2.plot(ROC_X, ROC_Y)
ax2.set_title(label="Receiver Operator Curve (ROC)")
ax2.set_xlabel(xlabel="False Positive Rate", fontsize=14)
ax2.set_ylabel(ylabel="True Positive Rate", fontsize=14)
ax2.set_xlim(0, 1)
ax2.set_ylim(0, 1)
plt.show()
fig.savefig(f'hypo_{round(mu_alt, 3)}.png')
# time.sleep(0.05)
pw = 1/1000
X = [(x + 0.5)*pw for x in range(22000)]
pts = 101
Sig_Levels = [(v * 100/(pts-1))/100 for v in range(0, pts)]
for i in range(26):
std_alt = 3.5 - 0.1 * i
std_alt = round(std_alt, 1)
if sig_lev == 0.999:
sig_lev = 1
TP_Rate = 1 - C2[sig_i]
FP_Rate = 1 - C1[sig_i]
ROC_X.append(FP_Rate)
ROC_Y.append(TP_Rate)
ax1.set_xlim([0, 21])
fig.savefig(f'hypo_{round(sig_lev, 3)}.png')
# time.sleep(0.05)
F1 Score
$$F_1 = \frac{2}{\frac{1}{\text{Recall}} + \frac{1}{\text{Precision}}}$$
F1 = Harmonic Mean Of Recall and Precision
________________________________________________________
# Fit an L1-penalised model on the iris data and print its coefficients.
# NOTE(review): LR is presumably sklearn's LogisticRegression - confirm
# against the import.
X, y = load_iris(return_X_y=True)  # was a duplicated "X, y = X, y = ..." target
lr_mod = LR(penalty='l1', solver='liblinear')
lr_mod.fit(X, y)
print(lr_mod.coef_)
[[ 0. 2.52235623 -2.83220134 0. ]
[ 0.32846823 -1.79370624 0.66582088 -1.57267348]
[-2.62263278 -2.50833176 3.26131365 4.61826807]]
# Machine Learning: fit a linear model without an intercept term and read
# off its single coefficient.
mod_LR = LinearRegression(fit_intercept=False, copy_X=True)
X_col = X.reshape(-1, 1)
Y_col = Y.reshape(-1, 1)
mod_LR.fit(X_col, Y_col)
Cml = mod_LR.coef_[0, 0]
print(Cml)

# Visualize both fitted lines on one figure.
plt.figure(figsize=(10, 5))
stats_line = Cs*X+1  # + 1 separates the two exact plots
ml_line = Cml*X
plt.plot(X, stats_line)
plt.plot(X, ml_line)
plt.title('Models Determined By Stats And ML')
plt.legend(('Stats Way', 'ML Way'))
plt.show()
1.9918506427390517
1.9957689952415045
1.9992493711567196
1.001206029449972
In [ ]: # Visualize
plt.figure(figsize=(10, 5))
plt.scatter(X, Y, color='magenta')
plt.plot(X, Cs*X+b, color='black')
plt.ylim((0, 3.5))
plt.title('Models Determined By Stats And ML')
plt.show()
die_values = [1, 2, 3, 4, 5, 6]
sample_sizes = [2, 4, 8, 16, 32]
experiment_mean = np.mean(sample_means)
experiment_std = np.std(sample_means)
x_min = min(sample_means)
x_max = max(sample_means)
x = np.arange(x_min, x_max, 0.01)
y = norm.pdf(x, experiment_mean, experiment_std)
plt.plot(x, y)
# Make sure an "images" directory exists under the current working directory.
# makedirs(exist_ok=True) replaces the isdir() check followed by mkdir(), so
# there is no window between the check and the creation.
cwd = os.getcwd()
os.makedirs(f"{cwd}/images", exist_ok=True)
no_images = True
image_num = 0
if no_images:
die_values = [1, 2, 3, 4, 5, 6]
sample_sizes = [2, 4, 8, 16, 32]
num_add_samples_list = [2] + [1]*8 + [2]*5 + [10]*8 + [100]*9
sample_means_D = {k: [] for k in sample_sizes}
total_samples = 0
experiment_mean = np.mean(sample_means_D[sample_size])
experiment_std = np.std(sample_means_D[sample_size])
x_min = min(sample_means_D[sample_size])
x_max = max(sample_means_D[sample_size])
x = np.arange(x_min, x_max, 0.001)
y = norm.pdf(x, experiment_mean, experiment_std)
plt.plot(x, y)
plt.close()
# Simulate one million rolls of a die (uniform choice over die_values) and
# print the mean of the simulated population, rounded to one decimal place.
die_values = [1, 2, 3, 4, 5, 6]
# A single vectorised draw replaces a million size=1 calls; list() keeps
# die_roles a plain list of NumPy integers.
die_roles = list(np.random.choice(die_values, size=int(1e6)))
mean = round(np.mean(die_roles), 1)
print(f'Population mean is {mean}')
running_mean = round(np.mean(sample_means), 2)
if running_mean == 3.50:
break
# Make sure an "images" directory exists under the current working directory.
# makedirs(exist_ok=True) replaces the isdir() check followed by mkdir(), so
# there is no window between the check and the creation.
cwd = os.getcwd()
os.makedirs(f"{cwd}/images", exist_ok=True)
no_images = True
image_num = 0
if no_images:
die_values = [1, 2, 3, 4, 5, 6]
num_add_samples_list = [2] + [1]*8 + [2]*5 + [10]*8 + [100]*9
num_add_samples_list += [9000] + [90000] + [900000] + [1000000]
sample_means = []
total_samples = 0
color = 'tab:blue'
ax1.set_xlabel('Sample Means')
ax1.set_ylabel('Occurence Rate', color=color)
bins = len(set(sample_means))
ax1.hist(sample_means, bins=bins, density=True, stacked=True)
ax1.set_xlim([2, 5])
ax1.set_ylim([0, 4])
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Probability', color=color)
ax2.tick_params(axis='y', labelcolor=color)
running_mean = round(np.mean(sample_means), 2)
title = f'Mean of sample means = {running_mean} '
title += f'for {total_samples} samples of size 32'
running_std = np.std(sample_means)
x = np.arange(2, 5, 0.001)
y = norm.pdf(x, running_mean, running_std)
ax2.plot(x, y, color=color)
ax2.set_ylim([0, 2])
plt.title(title)
plt.axvline(3.5)
fig.tight_layout()
# if total_samples == 2:
# for i in range(5):
# plt.savefig(f"{cwd}/images/{image_num:02d}.png")
# image_num += 1
# elif total_samples > 1000000:
# for i in range(5):
# plt.savefig(f"{cwd}/images/{image_num:02d}.png")
# image_num += 1
# else:
# plt.savefig(f"{cwd}/images/{image_num:02d}.png")
# image_num += 1
plt.show();
# plt.close()
color = 'tab:red'
ax1.set_xlabel('time (s)')
ax1.set_ylabel('exp', color=color)
ax1.plot(t, data1, color=color)
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis
color = 'tab:blue'
ax2.set_ylabel('sin', color=color) # we already handled the x-label with ax
ax2.plot(t, data2, color=color)
ax2.tick_params(axis='y', labelcolor=color)
Binomial Variables
NOTE: Use The 10% Rule For Approximate Independence Of Trials When Resampling Is
Not Possible
Binomial Combinatorics
$$p(k \text{ of } n) = \binom{n}{k}\, p^{k} (1-p)^{n-k}$$
$$\binom{n}{k} = \frac{n!}{k!\,(n-k)!}$$
$$p(k \text{ of } n) = \frac{n!}{k!\,(n-k)!}\, p^{k} (1-p)^{n-k}$$
# Binomial probabilities p(k of n) for k = 0..n with per-trial success
# probability Ps, rounded to two decimals and shown as a bar chart.
fact = math.factorial
Ps = 0.73
p_D = {}
n = 7
for k in range(n+1):
    # Integer division keeps the binomial coefficient an exact int; true
    # division would go through a float and lose precision for large n.
    nCk = fact(n) // (fact(k) * fact(n-k))
    p = nCk * Ps**k * (1 - Ps)**(n - k)
    p_D[k] = round(p, 2)
# Pass concrete lists rather than dict views to the plotting call.
plt.bar(list(p_D.keys()), list(p_D.values()))
plt.show()