Skip to content

Instantly share code, notes, and snippets.

## checking outliers
# Module-level record of the most recent detect_outliers() result. It is kept
# so that code reading `outliers` after a call continues to work.
outliers = []


def detect_outliers(data, threshold=3):
    """Return the values of *data* whose absolute z-score exceeds *threshold*.

    Args:
        data: One-dimensional collection of numbers (list, NumPy array,
            pandas Series, ...).
        threshold: Number of standard deviations from the mean beyond which
            a value is treated as an outlier. Defaults to 3.

    Returns:
        A new list holding the outlying values in their original order.
        The list is empty when *data* is empty or all values are equal.

    Side effects:
        The module-level ``outliers`` list is overwritten in place with the
        same values, so it reflects only the latest call instead of
        accumulating results from every call.
    """
    values = list(data)
    found = []
    if values:
        mean = np.mean(values)
        std = np.std(values)
        # A zero (or NaN) spread means no value can be an outlier; skipping
        # the computation also avoids a division by zero.
        if std > 0:
            found = [v for v in values if np.abs((v - mean) / std) > threshold]
    # Slice assignment mutates the shared list without rebinding the name.
    outliers[:] = found
    return found
# Interquartile-range (IQR) outlier bounds.
# np.percentile does not require sorted input, so no sorting step is needed
# (the earlier `sorted(df)` call discarded its result and had no effect).
# NOTE(review): with the default axis, the percentiles are taken over all
# values in `df` together, not per column -- confirm this is intended.
quantile1, quantile3 = np.percentile(df, [25, 75])
IQR = quantile3 - quantile1
lowerbound = quantile1 - (1.5 * IQR)  # lower bound
upperbound = quantile3 + (1.5 * IQR)  # upper bound
# Replacing missing values in Glucose column with median of respective class.
def impute_Glucose(cols):
    """Return the Glucose reading, filling in a missing one by Outcome class.

    Args:
        cols: Indexable pair where ``cols[0]`` is the Glucose value and
            ``cols[1]`` is the Outcome label.

    Returns:
        120 when Glucose is missing and Outcome is 0, 142 when Glucose is
        missing and Outcome is anything else, otherwise the Glucose value
        unchanged.
    """
    Glucose = cols[0]
    Outcome = cols[1]
    if pd.isnull(Glucose):
        # Hard-coded fill values; the comment above this function describes
        # them as the per-class medians.
        return 120 if Outcome == 0 else 142
    # A present reading is passed through untouched.
    return Glucose
# Per-sample normalisation (note: this is not mean normalisation).
# sklearn's Normalizer rescales each row of the input to unit norm
# (L2 norm by default). `df1` must already be defined before this runs.
from sklearn import preprocessing
scaled_3 = preprocessing.Normalizer().fit_transform(df1)
scaled_3  # bare expression: only displays the array when run as a notebook cell
pd.DataFrame(scaled_3)  # result is not assigned; display-only in a notebook
# Min-max scaling: MinMaxScaler rescales each column of df1 to the scaler's
# feature range (0 to 1 with the default settings used here).
# `df1` must already be defined before this runs.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_2=scaler.fit_transform(df1)
scaled_2  # bare expression: only displays the array when run as a notebook cell
pd.DataFrame(scaled_2)  # result is not assigned; display-only in a notebook
## Standardisation
# StandardScaler centres each column of df1 on zero mean and scales it to
# unit variance (default settings). `df1` must already be defined.
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
scaled_df=sc.fit_transform(df1)
scaled_df  # bare expression: only displays the array when run as a notebook cell
pd.DataFrame(scaled_df)  # result is not assigned; display-only in a notebook
# Correlation
# Load the dataset into `df`.
# NOTE(review): the path below is an absolute path on the author's machine;
# it has to be adjusted before this runs anywhere else.
# NOTE(review): seaborn is imported here but not used in the lines shown.
import seaborn as sns
df=pd.read_csv(r"C:\Users\heena\Downloads\diabetes1.csv")
df.shape  # (rows, columns) of the loaded frame; display-only in a notebook
# Output: (768, 9)
df.corr()  # pairwise correlation of the columns; result is not stored
# Output (column header only; the correlation values were not captured):
# Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
# Paired t-test
# `marks2` is built by adding normally distributed noise (mean -1.25,
# standard deviation 5) to each of the 20 original marks. Adding a NumPy
# array to the list yields an element-wise sum as a NumPy array.
marks = [10, 20, 30, 22, 33, 44, 15, 50, 46, 25,
         45, 25, 33, 12, 45, 34, 31, 30, 30, 30]
marks2 = marks + stats.norm.rvs(scale=5, loc=-1.25, size=20)

# Related-samples test: both lists describe the same 20 subjects.
ttest, p_value = stats.ttest_rel(a=marks, b=marks2)

# Significance level. The conventional 5 % is used; the previous cut-off of
# 0.5 would reject the null hypothesis for any p-value below one half.
alpha = 0.05
if p_value >= alpha:
    print("Accept null hypothesis")
else:
    print("Reject null hypothesis")
# Two-sample t-test
# `classB` holds 60 simulated values drawn as 18 + Poisson(mu=33).
# `classA` is not created here; it must already exist (it is defined in the
# one-sample t-test snippet of this file).
classB = stats.poisson.rvs(loc=18, mu=33, size=60)

# Independent-samples test: the two classes are unrelated groups.
ttest, p_value = stats.ttest_ind(a=classA, b=classB)

# Significance level. The conventional 5 % is used; the previous cut-off of
# 0.5 would reject the null hypothesis for any p-value below one half.
alpha = 0.05
if p_value >= alpha:
    print("Accept null hypothesis")
else:
    print("Reject null hypothesis")
# Sample output from one run (the data are random, so it can vary):
# Reject null hypothesis
# One-sample t-test
# Creating poisson distribution for ages of students in the school
# (population) and one particular class (sample).
import scipy.stats as stats
import pandas as pd
import math

# Simulated values: 18 + Poisson(mu), for 1500 students and a class of 60.
school = stats.poisson.rvs(loc=18, mu=35, size=1500)
classA = stats.poisson.rvs(loc=18, mu=30, size=60)

# Compare the class mean against the school-wide mean. The function lives in
# scipy.stats, so it is called through the `stats` alias; the bare name
# `ttest_1samp` was never imported and raised NameError.
ttest, p_value = stats.ttest_1samp(classA, popmean=school.mean())

# Significance level. The conventional 5 % is used; the previous cut-off of
# 0.5 would reject the null hypothesis for any p-value below one half.
alpha = 0.05
if p_value >= alpha:
    print("Accept null hypothesis")