# HeenaR17

## z_score.py
##checking outliers
outliers=[]
def detect_outliers(data, threshold=3):
    """Return the values in *data* whose absolute z-score exceeds *threshold*.

    The z-score of a value is ``(value - mean) / std``, where ``std`` is the
    population standard deviation computed by ``np.std``.

    Args:
        data: A sized, re-iterable collection of numbers (list, array, ...).
        threshold: Number of standard deviations beyond which a value counts
            as an outlier. Defaults to 3, the previously hard-coded cut-off.

    Returns:
        A new list holding the outlying values in their original order.

    For backward compatibility the same values are also appended to the
    module-level ``outliers`` list, which therefore keeps growing across
    calls; prefer the return value.
    """
    if len(data) == 0:
        return []
    mean = np.mean(data)
    std = np.std(data)
    # With zero spread every z-score would be 0/0, so nothing can be an outlier.
    if std == 0:
        return []
    found = [value for value in data if np.abs((value - mean) / std) > threshold]
    outliers.extend(found)
    return found

## IQR.py
# Interquartile-range bounds for outlier detection.
# np.percentile does not need sorted input, so the former `sorted(df)` call
# (whose result was discarded) has been dropped.
# NOTE(review): with the default axis, np.percentile works on the flattened
# input -- confirm that df is a single numeric column rather than a whole table.
quantile1, quantile3 = np.percentile(df, [25, 75])
IQR = quantile3 - quantile1

lowerbound = quantile1 - (1.5 * IQR)  # lower bound: Q1 - 1.5 * IQR
upperbound = quantile3 + (1.5 * IQR)  # upper bound: Q3 + 1.5 * IQR

## null.py
# Replacing missing values in Glucose column with median of respective class.
def impute_Glucose(cols):
    """Return a Glucose value, filling a missing one from the row's Outcome.

    Args:
        cols: A two-item sequence ``(Glucose, Outcome)``.

    Returns:
        The original Glucose value when it is present; otherwise 120 when
        Outcome is 0 and 142 for any other Outcome.

    NOTE(review): 120 and 142 are hard-coded; they are presumably the
    per-class medians of the training data -- confirm against the dataset.
    """
    Glucose = cols[0]
    Outcome = cols[1]
    # The original ended in an empty `else:`; a present value is passed through.
    if not pd.isnull(Glucose):
        return Glucose
    return 120 if Outcome == 0 else 142

## mean_normalization.py
# Normalisation with scikit-learn's Normalizer.
# NOTE(review): this snippet was labelled "mean normalisation", but Normalizer
# rescales each sample (row) to unit norm and does not subtract a mean --
# confirm which transformation was intended.
from sklearn import preprocessing
scaled_3 = preprocessing.Normalizer().fit(df1).transform(df1)
scaled_3  # notebook-style display of the transformed array

pd.DataFrame(scaled_3)  # notebook-style display as a DataFrame

## minmax.py
# Min-max scaling of df1 with scikit-learn's MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df1)
scaled_2 = scaler.transform(df1)
scaled_2  # notebook-style display of the scaled array

pd.DataFrame(scaled_2)  # notebook-style display as a DataFrame

## std.py
# Standardisation of df1 with scikit-learn's StandardScaler.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(df1)
scaled_df = sc.transform(df1)
scaled_df  # notebook-style display of the standardised array

pd.DataFrame(scaled_df)  # notebook-style display as a DataFrame

## correlation.py
# Correlation
# Load the dataset from CSV and inspect its shape and column correlations.
# NOTE(review): `pd` must already be imported; this snippet does not import
# pandas itself. The path below is an absolute path on the author's machine.
import seaborn as sns  # `sns` is not referenced anywhere in the visible file
df=pd.read_csv(r"C:\Users\heena\Downloads\diabetes1.csv")

df.shape  # bare expression: only shows a value in an interactive session
# Output: (768, 9)

df.corr()  # bare expression: correlation table, shown only interactively
# Output columns: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin,
# BMI, DiabetesPedigreeFunction, Age, Outcome

## paired.py
#Paired Ttest
# Paired t-test between a list of marks and a noisy copy of the same marks.
import scipy.stats as stats  # this snippet uses `stats` but never imported it

# Significance level. The previous cut-off of 0.5 rejected the null
# hypothesis for any p-value below 0.5, far looser than the usual 5% level.
ALPHA = 0.05

marks = [10, 20, 30, 22, 33, 44, 15, 50, 46, 25, 45, 25, 33, 12, 45, 34, 31, 30, 30, 30]
# list + ndarray is an element-wise sum: each mark is shifted by a draw from
# a normal distribution with loc=-1.25 and scale=5.
marks2 = marks + stats.norm.rvs(scale=5, loc=-1.25, size=20)

ttest, p_value = stats.ttest_rel(a=marks, b=marks2)
if p_value >= ALPHA:
    print("Accept null hypothesis")
else:
    print("Reject null hypothesis")

## 2sample.py
# Two-sample (independent) t-test between classA and classB.
# NOTE(review): classA is not created in this snippet; it is defined by the
# one-sample snippet elsewhere in this file -- confirm the execution order.
import scipy.stats as stats  # this snippet uses `stats` but never imported it

# Significance level. The previous cut-off of 0.5 rejected the null
# hypothesis for any p-value below 0.5, far looser than the usual 5% level.
ALPHA = 0.05

classB = stats.poisson.rvs(loc=18, mu=33, size=60)
ttest, p_value = stats.ttest_ind(a=classA, b=classB)
if p_value >= ALPHA:
    print("Accept null hypothesis")
else:
    print("Reject null hypothesis")
#Reject null hypothesis

## Ttest2.py
#Creating poisson distribution for ages of students in the school (population) and one particular class (sample)
import scipy.stats as stats
import pandas as pd
import math
# One-sample t-test: does classA's mean age differ from the school's mean age?

# Significance level. The previous cut-off of 0.5 rejected the null
# hypothesis for any p-value below 0.5, far looser than the usual 5% level.
ALPHA = 0.05

school = stats.poisson.rvs(loc=18, mu=35, size=1500)  # population: whole school
classA = stats.poisson.rvs(loc=18, mu=30, size=60)  # sample: one class
# ttest_1samp lives in scipy.stats; the bare name was never imported.
ttest, p_value = stats.ttest_1samp(classA, popmean=school.mean())

if p_value >= ALPHA:
    print("Accept null hypothesis")
else:
    # Added so both outcomes are reported, matching the sibling t-test snippets.
    print("Reject null hypothesis")
	##checking outliers
	outliers=[]
	def detect_outliers(data, threshold=3):
		"""Return the values in *data* whose absolute z-score exceeds *threshold*.

		The z-score of a value is ``(value - mean) / std``, where ``std`` is the
		population standard deviation computed by ``np.std``.

		Args:
			data: A sized, re-iterable collection of numbers (list, array, ...).
			threshold: Number of standard deviations beyond which a value
				counts as an outlier. Defaults to 3, the previously
				hard-coded cut-off.

		Returns:
			A new list holding the outlying values in their original order.

		For backward compatibility the same values are also appended to the
		``outliers`` list defined just above, which therefore keeps growing
		across calls; prefer the return value.
		"""
		if len(data) == 0:
			return []
		mean = np.mean(data)
		std = np.std(data)
		# With zero spread every z-score would be 0/0, so nothing is an outlier.
		if std == 0:
			return []
		found = [value for value in data if np.abs((value - mean) / std) > threshold]
		outliers.extend(found)
		return found
	# Interquartile-range bounds for outlier detection.
	# np.percentile does not need sorted input, so the former `sorted(df)` call
	# (whose result was discarded) has been dropped.
	# NOTE(review): with the default axis, np.percentile works on the flattened
	# input -- confirm that df is a single numeric column rather than a table.
	quantile1, quantile3 = np.percentile(df, [25, 75])
	IQR = quantile3 - quantile1

	lowerbound = quantile1 - (1.5 * IQR)  # lower bound: Q1 - 1.5 * IQR
	upperbound = quantile3 + (1.5 * IQR)  # upper bound: Q3 + 1.5 * IQR
	# Replacing missing values in Glucose column with median of respective class.
	def impute_Glucose(cols):
		"""Return a Glucose value, filling a missing one from the row's Outcome.

		Args:
			cols: A two-item sequence ``(Glucose, Outcome)``.

		Returns:
			The original Glucose value when it is present; otherwise 120 when
			Outcome is 0 and 142 for any other Outcome.

		NOTE(review): 120 and 142 are hard-coded; they are presumably the
		per-class medians of the training data -- confirm against the dataset.
		"""
		Glucose = cols[0]
		Outcome = cols[1]
		# The original ended in an empty `else:`; a present value is passed through.
		if not pd.isnull(Glucose):
			return Glucose
		return 120 if Outcome == 0 else 142
	# Normalisation with scikit-learn's Normalizer.
	# NOTE(review): this snippet was labelled "mean normalisation", but Normalizer
	# rescales each sample (row) to unit norm and does not subtract a mean --
	# confirm which transformation was intended.
	from sklearn import preprocessing
	scaled_3 = preprocessing.Normalizer().fit(df1).transform(df1)
	scaled_3  # notebook-style display of the transformed array

	pd.DataFrame(scaled_3)  # notebook-style display as a DataFrame
	# Min-max scaling of df1 with scikit-learn's MinMaxScaler.
	from sklearn.preprocessing import MinMaxScaler
	scaler = MinMaxScaler()
	scaler.fit(df1)
	scaled_2 = scaler.transform(df1)
	scaled_2  # notebook-style display of the scaled array

	pd.DataFrame(scaled_2)  # notebook-style display as a DataFrame
	# Standardisation of df1 with scikit-learn's StandardScaler.
	from sklearn.preprocessing import StandardScaler
	sc = StandardScaler()
	sc.fit(df1)
	scaled_df = sc.transform(df1)
	scaled_df  # notebook-style display of the standardised array

	pd.DataFrame(scaled_df)  # notebook-style display as a DataFrame
	# Correlation
	# Load the dataset from CSV and inspect its shape and column correlations.
	# NOTE(review): `pd` must already be imported; this snippet does not import
	# pandas itself. The path below is an absolute path on the author's machine.
	import seaborn as sns  # `sns` is not referenced anywhere in the visible file
	df=pd.read_csv(r"C:\Users\heena\Downloads\diabetes1.csv")

	df.shape  # bare expression: only shows a value in an interactive session
	# Output: (768, 9)

	df.corr()  # bare expression: correlation table, shown only interactively
	# Output columns: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin,
	# BMI, DiabetesPedigreeFunction, Age, Outcome
	#Paired Ttest
	# Paired t-test between a list of marks and a noisy copy of the same marks.
	import scipy.stats as stats  # this snippet uses `stats` but never imported it

	# Significance level. The previous cut-off of 0.5 rejected the null
	# hypothesis for any p-value below 0.5, far looser than the usual 5% level.
	ALPHA = 0.05

	marks = [10, 20, 30, 22, 33, 44, 15, 50, 46, 25, 45, 25, 33, 12, 45, 34, 31, 30, 30, 30]
	# list + ndarray is an element-wise sum: each mark is shifted by a draw from
	# a normal distribution with loc=-1.25 and scale=5.
	marks2 = marks + stats.norm.rvs(scale=5, loc=-1.25, size=20)

	ttest, p_value = stats.ttest_rel(a=marks, b=marks2)
	if p_value >= ALPHA:
		print("Accept null hypothesis")
	else:
		print("Reject null hypothesis")
	# Two-sample (independent) t-test between classA and classB.
	# NOTE(review): classA is not created in this snippet; it is defined by the
	# one-sample snippet elsewhere in this file -- confirm the execution order.
	import scipy.stats as stats  # this snippet uses `stats` but never imported it

	# Significance level. The previous cut-off of 0.5 rejected the null
	# hypothesis for any p-value below 0.5, far looser than the usual 5% level.
	ALPHA = 0.05

	classB = stats.poisson.rvs(loc=18, mu=33, size=60)
	ttest, p_value = stats.ttest_ind(a=classA, b=classB)
	if p_value >= ALPHA:
		print("Accept null hypothesis")
	else:
		print("Reject null hypothesis")
	#Reject null hypothesis
	#Creating poisson distribution for ages of students in the school (population) and one particular class (sample)
	import scipy.stats as stats
	import pandas as pd
	import math
	# One-sample t-test: does classA's mean age differ from the school's mean age?

	# Significance level. The previous cut-off of 0.5 rejected the null
	# hypothesis for any p-value below 0.5, far looser than the usual 5% level.
	ALPHA = 0.05

	school = stats.poisson.rvs(loc=18, mu=35, size=1500)  # population: whole school
	classA = stats.poisson.rvs(loc=18, mu=30, size=60)  # sample: one class
	# ttest_1samp lives in scipy.stats; the bare name was never imported.
	ttest, p_value = stats.ttest_1samp(classA, popmean=school.mean())

	if p_value >= ALPHA:
		print("Accept null hypothesis")
	else:
		# Added so both outcomes are reported, matching the sibling t-test snippets.
		print("Reject null hypothesis")