Note: If you encounter any programming errors, please comment below.
LAB 1: Descriptive Statistics.
Question 1: Welcome to Statistics with Python | 1 | Descriptive Statistics
Solution 1:
#!/bin/python3
#Write your code here
import numpy as np
from scipy import stats
def compute_descriptive_stats(data):
    """Compute descriptive statistics for a sample.

    Parameters
    ----------
    data : array_like
        Sample values; converted with ``np.array`` before any computation.

    Returns
    -------
    mean : float
        Mean value for the sample data `data`
    median : float
        Median value for the sample data `data`
    mode : int
        Mode value for the sample data `data` (first field of the
        ``scipy.stats.mode`` result).
        NOTE(review): whether this is a scalar or a length-1 array appears
        to depend on the installed SciPy version -- confirm.
    percentile : numpy.ndarray
        25th and 75th percentile values for the sample data `data`,
        computed with the 'lower' rule.
    iqr : float
        Inter quartile range value for the sample data `data`, computed
        with the 'lower' rule.
    skew : float
        Skewness value for the sample data `data`
    kurtosis : float
        Kurtosis value for the sample data `data`
    """
    sample = np.array(data)

    # Task 1: mean of the sample.
    mean = np.mean(sample)

    # Task 2: median of the sample.
    median = np.median(sample)

    # Task 3: mode of the sample.
    mode = stats.mode(sample)[0]

    # Task 4: 25th and 75th percentiles as a numpy array.
    # NumPy 1.22 renamed the `interpolation` keyword to `method` and
    # deprecated the old spelling. Prefer the new keyword and fall back to
    # the legacy one only when the installed NumPy does not know `method`.
    try:
        percentile = np.percentile(sample, [25, 75], method='lower')
    except TypeError:
        percentile = np.percentile(sample, [25, 75], interpolation='lower')

    # Task 5: inter-quartile range (scipy's iqr keeps the `interpolation` name).
    iqr = stats.iqr(sample, interpolation='lower')

    # Task 6: skewness of the sample.
    skew = stats.skew(sample)

    # Task 7: kurtosis of the sample.
    kurtosis = stats.kurtosis(sample)

    return mean, median, mode, percentile, iqr, skew, kurtosis
LAB 2: Random Distributions.
Question 2: Welcome to Statistics with Python | 2 | Random Distributions.
Solution 2:
#!/bin/python3
#Write your code here
from scipy import stats
import numpy as np
def compute_absolute_difference(mean, std, seed):
    """Return |sample mean - distribution mean| for a seeded normal sample.

    Parameters
    ----------
    mean : float
        mean value for the normal distribution
    std : float
        standard deviation value for the normal distribution
    seed : int
        seed value for randomness

    Returns
    -------
    absolute_difference : float
        absolute difference between the sample mean and the distribution mean.
    """
    # Task 1: normal distribution with the requested location and scale.
    normal_distribution = stats.norm(loc=mean, scale=std)

    # Task 2: seed NumPy's global generator, then draw 100 values.
    np.random.seed(seed)
    random_sample = normal_distribution.rvs(100)

    # Task 3: the difference must be non-negative, so take abs(); the
    # plain subtraction is negative whenever the sample mean is below `mean`.
    sample_mean = np.mean(random_sample)
    absolute_difference = abs(sample_mean - mean)

    return absolute_difference
LAB 3: Random Distribution 2
Question 3: Welcome to Statistics with Python | 3 | Random Experiment.
Solution 3:
#!/bin/python3
#Write your code here
import sys
import numpy as np
from scipy.stats import binom
def count_random_heads(number_sample, random_state):
    """Simulate fair coin tosses and count how many came up Heads.

    Each toss is one draw from ``binom(n=1, p=0.5)``; the value 0 stands
    for Heads and 1 for Tails.

    Parameters
    ----------
    number_sample : int
        number of times the experiment is repeated
    random_state : int
        seed/state value for the randomness

    Returns
    -------
    head_count : int
        Count of the number of 'Heads' (zeros) in the sample
    """
    # Task 1 + 2: pass the seed straight to rvs instead of reseeding
    # NumPy's global generator, so the caller's random state is untouched.
    data_binom = binom.rvs(n=1, p=0.5, size=number_sample,
                           random_state=random_state)

    # Task 3: count zeros directly. Unlike np.bincount(...)[0], this is
    # also well defined when the sample is empty.
    head_count = int(np.count_nonzero(data_binom == 0))

    return head_count
LAB 4: Exercise - Hypothesis testing.
Question 4: Welcome to Statistics with Python | 4 | Hypothesis Testing 1.
Solution 4:
#!/bin/python3
#Write your code here
from scipy import stats
def perform_ttest(sample1, sample2):
    """Run an independent two-sample t-test on the given samples.

    The samples represent the life satisfaction score (computed through a
    methodology) of older adults and younger adults respectively.

    Parameters
    ----------
    sample1 : list
        sample values taken from group 1
    sample2 : list
        sample values taken from group 2

    Returns
    -------
    t_score : float
        t-score of t-test
    p_value : float
        p-value of t-test
    """
    # The two groups are independent, hence ttest_ind.
    outcome = stats.ttest_ind(sample1, sample2)
    return outcome.statistic, outcome.pvalue
LAB 5: Exercise - Hypothesis testing.
Question 5: Welcome to Statistics with Python | 5 | Hypothesis Testing 2.
Solution 5:
#!/bin/python3
#Write your code here
from scipy import stats
def perform_ttest(sample1, sample2):
    """Run a paired (related-samples) t-test on the given samples.

    The samples represent the number of chocolate chips consumed by 10
    rats: `sample1` is consumption with electrical stimulation and
    `sample2` is consumption without stimulation.

    Parameters
    ----------
    sample1 : list
        chocolate chips consumption with stimulation
    sample2 : list
        chocolate chips consumption without stimulation

    Returns
    -------
    t_score : float
        t-score of t-test
    p_value : float
        p-value of t-test
    """
    # Both measurements come from the same subjects, hence ttest_rel.
    outcome = stats.ttest_rel(sample1, sample2)
    return outcome.statistic, outcome.pvalue
LAB 6: Linear Regression 1.
Question 6: Welcome to Statistics with Python | 6 | Linear Regression 1.
Solution 6:
#!/bin/python3
#Write your code here
# from collections.abc import KeysView
# from nltk.lm import models
import statsmodels.api as sm
# import statsmodels.formula.api as smf
import pandas as pd
# import numpy as np
# from statsmodels.stats import anova
def build_lr():
    """Fit `mpg ~ wt` on the R `mtcars` dataset and return R-squared.

    Returns
    -------
    r_squared : float
        r-squared value of the trained linear regression model
    """
    # Task 1: load the R dataset and hold it as a pandas dataframe.
    cars = pd.DataFrame(sm.datasets.get_rdataset("mtcars", "datasets").data)

    # Task 2: ordinary least squares of mpg on wt, with an intercept column.
    response = cars['mpg']
    design = sm.add_constant(cars['wt'])
    fitted = sm.OLS(response, design).fit()

    # Task 3: report the fit quality as a plain Python float.
    return float(fitted.rsquared)
LAB 7: Linear Regression 2.
Question 7: Welcome to Statistics with Python | 7 | Linear Regression 2.
Solution 7:
#!/bin/python3
#Write your code here
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
def build_lr():
    """Fit `log(mpg) ~ log(wt)` on the R `mtcars` dataset and return R-squared.

    Returns
    -------
    r_squared : float
        r-squared value of the trained linear regression model
    """
    # Task 1: load the R dataset and hold it as a pandas dataframe.
    cars = pd.DataFrame(sm.datasets.get_rdataset("mtcars", "datasets").data)

    # Task 2: log-log regression, with the transforms written in the formula.
    formula = 'np.log(mpg) ~ np.log(wt)'
    fitted = smf.ols(formula=formula, data=cars).fit()

    # Task 3: report the fit quality as a plain Python float.
    return float(fitted.rsquared)
LAB 8: Exercise - Logistic Regression.
Question 8: Welcome to Statistics with Python | 8 | Logistic Regression.
Solution 8:
#!/bin/python3
#Write your code here
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
def build_log_reg():
    """Fit a logistic regression `Class ~ V1` on the MASS `biopsy` dataset.

    Returns
    -------
    r_squared : float
        pseudo r-squared value of the trained logistic regression model
    """
    # Task 1: load the R dataset biopsy from the MASS package as a dataframe.
    biopsy_dataset = sm.datasets.get_rdataset("biopsy", "MASS")
    df = pd.DataFrame(biopsy_dataset.data)

    # Task 2: rename the column `class` to `Class`.
    df = df.rename(columns={'class': 'Class'})

    # Task 3: encode benign -> 0 and malignant -> 1.
    # Assign the recoded column back instead of calling
    # df['Class'].replace(..., inplace=True): that chained in-place call
    # acts on a temporary Series and is not guaranteed to update `df`
    # (it does not under pandas Copy-on-Write).
    df['Class'] = df['Class'].map({'benign': 0, 'malignant': 1}).astype(int)

    # Task 4: logistic regression with V1 as the independent variable.
    model = smf.logit("Class ~ V1", data=df).fit()

    # Task 5: return the pseudo R-squared as a plain Python float.
    r_squared = float(model.prsquared)
    return r_squared
LAB 9: Exercise - Poisson Regression.
Question 9: Welcome to Statistics with Python | 9 | Poisson Regression.
Solution 9:
#!/bin/python3
#Write your code here
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
def build_pos_reg():
    """Fit a Poisson regression `Claims ~ log(Holders)` on MASS `Insurance`.

    Returns
    -------
    residuals_sum : float
        sum of the residuals for the trained Poisson regression model
    """
    # Task 1: load the R dataset Insurance from the MASS package as a dataframe.
    insurance_dataset = sm.datasets.get_rdataset("Insurance", "MASS")
    df = pd.DataFrame(insurance_dataset.data)

    # Task 2: Poisson regression of Claims on the log of Holders. The log
    # is applied inside the formula, so no helper column has to be written
    # into the loaded data.
    poisson_model = smf.poisson('Claims ~ np.log(Holders)', df).fit()

    # Task 3: sum the residuals and return the total as a plain Python float.
    residuals_sum = float(np.sum(poisson_model.resid))
    return residuals_sum
LAB 10: Exercise - ANOVA 1.
Question 10: Welcome to Statistics with Python | 10 | ANOVA 1.
Solution 10:
#!/bin/python3
#Write your code here
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats import anova
import numpy as np
import pandas as pd
def build_anova():
    """Run ANOVA on the linear model `mpg ~ wt` fitted to R's `mtcars`.

    Returns
    -------
    f1_score : float
        F-statistic value of the ANOVA model
    """
    # Task 1: load the R dataset and hold it as a pandas dataframe.
    cars = pd.DataFrame(sm.datasets.get_rdataset("mtcars", "datasets").data)

    # Task 2: linear regression of mpg on wt.
    fitted = smf.ols('mpg ~ wt', cars).fit()

    # Task 3: ANOVA table for the fitted model; pick the F value of the wt row.
    anova_table = anova.anova_lm(fitted)
    return float(anova_table.loc["wt", "F"])
LAB 11: Exercise - ANOVA 2
Question 11: Welcome to Statistics with Python | 11 | ANOVA 2.
Solution 11:
#!/bin/python3
#Write your code here
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats import anova
import numpy as np
import pandas as pd
def build_anova():
    """Run ANOVA on `log(mpg) ~ log(wt)` fitted to R's `mtcars`.

    Returns
    -------
    f1_score : float
        F-statistic value of the ANOVA model
    """
    # Task 1: load the R dataset and hold it as a pandas dataframe.
    cars = pd.DataFrame(sm.datasets.get_rdataset("mtcars", "datasets").data)

    # Task 2: log-log linear regression, with the transforms in the formula.
    fitted = smf.ols(formula='np.log(mpg) ~ np.log(wt)', data=cars).fit()

    # Task 3: ANOVA table for the fitted model; pick the F value of the
    # np.log(wt) row.
    anova_table = anova.anova_lm(fitted)
    return float(anova_table.loc["np.log(wt)", "F"])