import pandas as pd
import numpy as np
import seaborn as sns
df=pd.read_excel('health_data_nulls.xlsx')
df1 = pd.read_excel('health_data_oob_values.xlsx')
df
Person | Age | Income | Alcohol | Exercise | Smoke | Blood Pressure | |
---|---|---|---|---|---|---|---|
0 | 1 | 61.0 | 268300.0 | 41.0 | NaN | 3.0 | 62 |
1 | 2 | 55.0 | 122200.0 | 51.0 | 7.0 | 56.0 | 53 |
2 | 3 | 53.0 | 82100.0 | 37.0 | 0.0 | 55.0 | 42 |
3 | 4 | 30.0 | 101400.0 | 41.0 | 20.0 | 61.0 | 48 |
4 | 5 | 64.0 | 181100.0 | NaN | 0.0 | 70.0 | 81 |
... | ... | ... | ... | ... | ... | ... | ... |
995 | 996 | 50.0 | 141300.0 | 9.0 | 11.0 | 36.0 | 43 |
996 | 997 | 40.0 | 155700.0 | 0.0 | NaN | 3.0 | 29 |
997 | 998 | 36.0 | 84700.0 | 42.0 | 47.0 | 21.0 | 21 |
998 | 999 | 51.0 | 124500.0 | 63.0 | 40.0 | 1.0 | 25 |
999 | 1000 | 28.0 | 241200.0 | NaN | 90.0 | 0.0 | 42 |
1000 rows × 7 columns
df1.head(20)
Person | Age | Income | Alcohol | Exercise | Smoke | Blood Pressure | |
---|---|---|---|---|---|---|---|
0 | 1 | 61 | 268300 | 41 | 999 | 3 | 62 |
1 | 2 | 55 | 122200 | 51 | 7 | 56 | 53 |
2 | 3 | 53 | 82100 | 37 | 0 | 55 | 42 |
3 | 4 | 30 | 101400 | 41 | 20 | 61 | 48 |
4 | 5 | 64 | 181100 | 999 | 0 | 70 | 81 |
5 | 6 | 45 | 156600 | 60 | 35 | 999 | 80 |
6 | 7 | 56 | 160400 | 55 | 999 | 59 | 63 |
7 | 8 | 999 | 78800 | 31 | 12 | 43 | 31 |
8 | 9 | 59 | 233500 | 25 | 15 | 33 | 66 |
9 | 10 | 44 | 50400 | 64 | 0 | 85 | 54 |
10 | 11 | 999 | 224400 | 69 | 21 | 55 | 78 |
11 | 12 | 42 | 175000 | 22 | 26 | 21 | 43 |
12 | 13 | 63 | 255900 | 46 | 32 | 24 | 68 |
13 | 14 | 30 | 70300 | 53 | 999 | 79 | 56 |
14 | 15 | 52 | 229500 | 74 | 56 | 69 | 85 |
15 | 16 | 54 | 188600 | 62 | 51 | 5 | 39 |
16 | 17 | 42 | 265000 | 41 | 88 | 0 | 47 |
17 | 18 | 36 | 65400 | 41 | 14 | 59 | 34 |
18 | 19 | 56 | 81200 | 64 | 0 | 71 | 53 |
19 | 20 | 33 | 67200 | 46 | 9 | 54 | 33 |
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x1abfedb0688>
sns.pairplot(df1)
<seaborn.axisgrid.PairGrid at 0x1ab80f06548>
df1.Alcohol[df1.Alcohol==999] = np.nan
df1.head(15)
Person | Age | Income | Alcohol | Exercise | Smoke | Blood Pressure | |
---|---|---|---|---|---|---|---|
0 | 1 | 61 | 268300 | 41.0 | 999 | 3 | 62 |
1 | 2 | 55 | 122200 | 51.0 | 7 | 56 | 53 |
2 | 3 | 53 | 82100 | 37.0 | 0 | 55 | 42 |
3 | 4 | 30 | 101400 | 41.0 | 20 | 61 | 48 |
4 | 5 | 64 | 181100 | NaN | 0 | 70 | 81 |
5 | 6 | 45 | 156600 | 60.0 | 35 | 999 | 80 |
6 | 7 | 56 | 160400 | 55.0 | 999 | 59 | 63 |
7 | 8 | 999 | 78800 | 31.0 | 12 | 43 | 31 |
8 | 9 | 59 | 233500 | 25.0 | 15 | 33 | 66 |
9 | 10 | 44 | 50400 | 64.0 | 0 | 85 | 54 |
10 | 11 | 999 | 224400 | 69.0 | 21 | 55 | 78 |
11 | 12 | 42 | 175000 | 22.0 | 26 | 21 | 43 |
12 | 13 | 63 | 255900 | 46.0 | 32 | 24 | 68 |
13 | 14 | 30 | 70300 | 53.0 | 999 | 79 | 56 |
14 | 15 | 52 | 229500 | 74.0 | 56 | 69 | 85 |
df['Age'].isnull().sum()
104
df['Income'].isnull().sum()
31
df['Age'].mean()
44.825892857142854
df['AgeImpMean']=df['Age'].fillna(df['Age'].mean())
df.head(15)
Person | Age | Income | Alcohol | Exercise | Smoke | Blood Pressure | AgeImpMean | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 61.0 | 268300.0 | 41.0 | NaN | 3.0 | 62 | 61.000000 |
1 | 2 | 55.0 | 122200.0 | 51.0 | 7.0 | 56.0 | 53 | 55.000000 |
2 | 3 | 53.0 | 82100.0 | 37.0 | 0.0 | 55.0 | 42 | 53.000000 |
3 | 4 | 30.0 | 101400.0 | 41.0 | 20.0 | 61.0 | 48 | 30.000000 |
4 | 5 | 64.0 | 181100.0 | NaN | 0.0 | 70.0 | 81 | 64.000000 |
5 | 6 | 45.0 | 156600.0 | 60.0 | 35.0 | NaN | 80 | 45.000000 |
6 | 7 | 56.0 | 160400.0 | 55.0 | NaN | 59.0 | 63 | 56.000000 |
7 | 8 | NaN | 78800.0 | 31.0 | 12.0 | 43.0 | 31 | 44.825893 |
8 | 9 | 59.0 | 233500.0 | 25.0 | 15.0 | 33.0 | 66 | 59.000000 |
9 | 10 | 44.0 | 50400.0 | 64.0 | 0.0 | 85.0 | 54 | 44.000000 |
10 | 11 | NaN | 224400.0 | 69.0 | 21.0 | 55.0 | 78 | 44.825893 |
11 | 12 | 42.0 | 175000.0 | 22.0 | 26.0 | 21.0 | 43 | 42.000000 |
12 | 13 | 63.0 | 255900.0 | 46.0 | 32.0 | 24.0 | 68 | 63.000000 |
13 | 14 | 30.0 | 70300.0 | 53.0 | NaN | 79.0 | 56 | 30.000000 |
14 | 15 | 52.0 | 229500.0 | 74.0 | 56.0 | 69.0 | 85 | 52.000000 |
df['AgeImpMed']=df['Age'].fillna(df['Age'].median())
df['AgeImpMode']=df['Age'].fillna(df['Age'].mode())
df.head(15)
Person | Age | Income | Alcohol | Exercise | Smoke | Blood Pressure | AgeImpMean | AgeImpMed | AgeImpMode | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 61.0 | 268300.0 | 41.0 | NaN | 3.0 | 62 | 61.000000 | 61.0 | 61.0 |
1 | 2 | 55.0 | 122200.0 | 51.0 | 7.0 | 56.0 | 53 | 55.000000 | 55.0 | 55.0 |
2 | 3 | 53.0 | 82100.0 | 37.0 | 0.0 | 55.0 | 42 | 53.000000 | 53.0 | 53.0 |
3 | 4 | 30.0 | 101400.0 | 41.0 | 20.0 | 61.0 | 48 | 30.000000 | 30.0 | 30.0 |
4 | 5 | 64.0 | 181100.0 | NaN | 0.0 | 70.0 | 81 | 64.000000 | 64.0 | 64.0 |
5 | 6 | 45.0 | 156600.0 | 60.0 | 35.0 | NaN | 80 | 45.000000 | 45.0 | 45.0 |
6 | 7 | 56.0 | 160400.0 | 55.0 | NaN | 59.0 | 63 | 56.000000 | 56.0 | 56.0 |
7 | 8 | NaN | 78800.0 | 31.0 | 12.0 | 43.0 | 31 | 44.825893 | 45.0 | NaN |
8 | 9 | 59.0 | 233500.0 | 25.0 | 15.0 | 33.0 | 66 | 59.000000 | 59.0 | 59.0 |
9 | 10 | 44.0 | 50400.0 | 64.0 | 0.0 | 85.0 | 54 | 44.000000 | 44.0 | 44.0 |
10 | 11 | NaN | 224400.0 | 69.0 | 21.0 | 55.0 | 78 | 44.825893 | 45.0 | NaN |
11 | 12 | 42.0 | 175000.0 | 22.0 | 26.0 | 21.0 | 43 | 42.000000 | 42.0 | 42.0 |
12 | 13 | 63.0 | 255900.0 | 46.0 | 32.0 | 24.0 | 68 | 63.000000 | 63.0 | 63.0 |
13 | 14 | 30.0 | 70300.0 | 53.0 | NaN | 79.0 | 56 | 30.000000 | 30.0 | 30.0 |
14 | 15 | 52.0 | 229500.0 | 74.0 | 56.0 | 69.0 | 85 | 52.000000 | 52.0 | 52.0 |
sns.regplot(x="Age", y="Blood Pressure", data=df)
<AxesSubplot:xlabel='Age', ylabel='Blood Pressure'>
sns.regplot(x="AgeImpMean", y="Blood Pressure", data=df)
<AxesSubplot:xlabel='AgeImpMean', ylabel='Blood Pressure'>
df['AgeNA']=df['Age'].isna()
df.head(15)
Person | Age | Income | Alcohol | Exercise | Smoke | Blood Pressure | AgeImpMean | AgeImpMed | AgeImpMode | AgeNA | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 61.0 | 268300.0 | 41.0 | NaN | 3.0 | 62 | 61.000000 | 61.0 | 61.0 | False |
1 | 2 | 55.0 | 122200.0 | 51.0 | 7.0 | 56.0 | 53 | 55.000000 | 55.0 | 55.0 | False |
2 | 3 | 53.0 | 82100.0 | 37.0 | 0.0 | 55.0 | 42 | 53.000000 | 53.0 | 53.0 | False |
3 | 4 | 30.0 | 101400.0 | 41.0 | 20.0 | 61.0 | 48 | 30.000000 | 30.0 | 30.0 | False |
4 | 5 | 64.0 | 181100.0 | NaN | 0.0 | 70.0 | 81 | 64.000000 | 64.0 | 64.0 | False |
5 | 6 | 45.0 | 156600.0 | 60.0 | 35.0 | NaN | 80 | 45.000000 | 45.0 | 45.0 | False |
6 | 7 | 56.0 | 160400.0 | 55.0 | NaN | 59.0 | 63 | 56.000000 | 56.0 | 56.0 | False |
7 | 8 | NaN | 78800.0 | 31.0 | 12.0 | 43.0 | 31 | 44.825893 | 45.0 | NaN | True |
8 | 9 | 59.0 | 233500.0 | 25.0 | 15.0 | 33.0 | 66 | 59.000000 | 59.0 | 59.0 | False |
9 | 10 | 44.0 | 50400.0 | 64.0 | 0.0 | 85.0 | 54 | 44.000000 | 44.0 | 44.0 | False |
10 | 11 | NaN | 224400.0 | 69.0 | 21.0 | 55.0 | 78 | 44.825893 | 45.0 | NaN | True |
11 | 12 | 42.0 | 175000.0 | 22.0 | 26.0 | 21.0 | 43 | 42.000000 | 42.0 | 42.0 | False |
12 | 13 | 63.0 | 255900.0 | 46.0 | 32.0 | 24.0 | 68 | 63.000000 | 63.0 | 63.0 | False |
13 | 14 | 30.0 | 70300.0 | 53.0 | NaN | 79.0 | 56 | 30.000000 | 30.0 | 30.0 | False |
14 | 15 | 52.0 | 229500.0 | 74.0 | 56.0 | 69.0 | 85 | 52.000000 | 52.0 | 52.0 | False |
sns.lmplot(x="AgeImpMean", y="Blood Pressure", data=df, hue='AgeNA')
<seaborn.axisgrid.FacetGrid at 0x1abfecf0908>
from scipy import stats
corr, _ = stats.pearsonr(df['AgeImpMean'], df['Blood Pressure'])
corr
0.6449459380613165
import statsmodels.api as sm
correlation = df['AgeImpMean'].corr(df['Blood Pressure'])
correlation
0.6449459380613166
correlation = df['Age'].corr(df['Income'])
correlation
0.4989889602448255
df.dropna()
Person | Age | Income | Alcohol | Exercise | Smoke | Blood Pressure | AgeImpMean | AgeImpMed | AgeImpMode | AgeNA | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 55.0 | 122200.0 | 51.0 | 7.0 | 56.0 | 53 | 55.0 | 55.0 | 55.0 | False |
2 | 3 | 53.0 | 82100.0 | 37.0 | 0.0 | 55.0 | 42 | 53.0 | 53.0 | 53.0 | False |
3 | 4 | 30.0 | 101400.0 | 41.0 | 20.0 | 61.0 | 48 | 30.0 | 30.0 | 30.0 | False |
8 | 9 | 59.0 | 233500.0 | 25.0 | 15.0 | 33.0 | 66 | 59.0 | 59.0 | 59.0 | False |
9 | 10 | 44.0 | 50400.0 | 64.0 | 0.0 | 85.0 | 54 | 44.0 | 44.0 | 44.0 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
991 | 992 | 44.0 | 138300.0 | 23.0 | 27.0 | 12.0 | 31 | 44.0 | 44.0 | 44.0 | False |
994 | 995 | 30.0 | 63000.0 | 34.0 | 28.0 | 33.0 | 18 | 30.0 | 30.0 | 30.0 | False |
995 | 996 | 50.0 | 141300.0 | 9.0 | 11.0 | 36.0 | 43 | 50.0 | 50.0 | 50.0 | False |
997 | 998 | 36.0 | 84700.0 | 42.0 | 47.0 | 21.0 | 21 | 36.0 | 36.0 | 36.0 | False |
998 | 999 | 51.0 | 124500.0 | 63.0 | 40.0 | 1.0 | 25 | 51.0 | 51.0 | 51.0 | False |
638 rows × 11 columns
from statsmodels.formula.api import ols #or glm for logistic
df2=df
df2['BP']=df['Blood Pressure']
model_lm = ols(formula = 'BP ~ Age',
data = df2).fit()
print(model_lm.params)
Intercept -6.063997 Age 1.170684 dtype: float64
df2['Predicted']=df2['Age']*1.170684+(-6.063997)
df2['Predicted'].head()
0 65.347727 1 58.323623 2 55.982255 3 29.056523 4 68.859779 Name: Predicted, dtype: float64