from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as seas
import scipy as spy
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve


stress_df = pd.read_csv('/content/drive/MyDrive/CSV/mental_health_finaldata_1.csv')
print(stress_df)

          Age  Gender Occupation        Days_Indoors Growing_Stress  \
0       20-25  Female  Corporate           1-14 days            Yes   
1    30-Above    Male     Others          31-60 days            Yes   
2    30-Above  Female    Student    Go out Every day             No   
3       25-30    Male     Others           1-14 days            Yes   
4       16-20  Female    Student  More than 2 months            Yes   
..        ...     ...        ...                 ...            ...   
819     20-25    Male  Corporate    Go out Every day             No   
820     20-25    Male     Others           1-14 days            Yes   
821     20-25    Male    Student  More than 2 months            Yes   
822     16-20    Male   Business          15-30 days             No   
823  30-Above  Female     Others          15-30 days             No   

    Quarantine_Frustrations Changes_Habits Mental_Health_History  \
0                       Yes             No                   Yes   
1                       Yes          Maybe                    No   
2                        No            Yes                    No   
3                        No          Maybe                    No   
4                       Yes            Yes                    No   
..                      ...            ...                   ...   
819                     Yes             No                   Yes   
820                     Yes             No                   Yes   
821                   Maybe          Maybe                    No   
822                      No          Maybe                    No   
823                      No             No                    No   

    Weight_Change Mood_Swings Coping_Struggles Work_Interest Social_Weakness  
0             Yes      Medium               No            No             Yes  
1              No        High               No            No             Yes  
2              No      Medium              Yes         Maybe              No  
3           Maybe      Medium               No         Maybe             Yes  
4             Yes      Medium              Yes         Maybe              No  
..            ...         ...              ...           ...             ...  
819           Yes      Medium               No           Yes           Maybe  
820         Maybe         Low               No         Maybe           Maybe  
821           Yes        High              Yes           Yes           Maybe  
822         Maybe         Low              Yes            No           Maybe  
823           Yes         Low              Yes            No           Maybe  

[824 rows x 13 columns]


stress_table2 = pd.crosstab(stress_df['Growing_Stress'], stress_df['Age'])
statistic, p_value = f_oneway(stress_table2['16-20'], stress_table2['20-25'], stress_table2['25-30'], stress_table2['30-Above'])
print(stress_table2)
print("P-Value:", p_value)

Age             16-20  20-25  25-30  30-Above
Growing_Stress                               
Maybe              60     63     66        78
No                 75     47     65        69
Yes                76     76     74        75
P-Value: 0.4821995988536243


stress_table3 = pd.crosstab(stress_df['Age'], stress_df['Growing_Stress'])
descriptive_stats = stress_table3.describe()
print(stress_table3)
print(descriptive_stats)

Growing_Stress  Maybe  No  Yes
Age                           
16-20              60  75   76
20-25              63  47   76
25-30              66  65   74
30-Above           78  69   75
Growing_Stress      Maybe         No        Yes
count            4.000000   4.000000   4.000000
mean            66.750000  64.000000  75.250000
std              7.889867  12.055428   0.957427
min             60.000000  47.000000  74.000000
25%             62.250000  60.500000  74.750000
50%             64.500000  67.000000  75.500000
75%             69.000000  70.500000  76.000000
max             78.000000  75.000000  76.000000


stress_table2.plot(kind='bar', colormap='Paired')
plt.ylabel('Count')

Text(0, 0.5, 'Count')


plt.figure(figsize=(8, 6))
seas.violinplot(x='Growing_Stress', y='Age', data=stress_df, inner='quartile')
plt.ylabel('Growing Stress')
plt.xlabel('Age Group')
plt.title(f'Age Groups vs Growing Stress (Contingency Table)')
plt.grid(True)
plt.show()


alcohol_df = pd.read_csv('/content/drive/MyDrive/CSV/alcohol_consumption.csv')
print(alcohol_df)

    REF_DATE                             GEO  DGUID                 Age group  \
0       2015  Canada (excluding territories)    NaN  Total, 12 years and over   
1       2016  Canada (excluding territories)    NaN  Total, 12 years and over   
2       2017  Canada (excluding territories)    NaN  Total, 12 years and over   
3       2018  Canada (excluding territories)    NaN  Total, 12 years and over   
4       2019  Canada (excluding territories)    NaN  Total, 12 years and over   
..       ...                             ...    ...                       ...   
91      2018  Canada (excluding territories)    NaN         65 years and over   
92      2019  Canada (excluding territories)    NaN         65 years and over   
93      2020  Canada (excluding territories)    NaN         65 years and over   
94      2021  Canada (excluding territories)    NaN         65 years and over   
95      2022  Canada (excluding territories)    NaN         65 years and over   

           Sex      Indicators    Characteristics      UOM  UOM_ID  \
0   Both sexes  Heavy drinking  Number of persons   Number     223   
1   Both sexes  Heavy drinking  Number of persons   Number     223   
2   Both sexes  Heavy drinking  Number of persons   Number     223   
3   Both sexes  Heavy drinking  Number of persons   Number     223   
4   Both sexes  Heavy drinking  Number of persons   Number     223   
..         ...             ...                ...      ...     ...   
91  Both sexes  Heavy drinking            Percent  Percent     239   
92  Both sexes  Heavy drinking            Percent  Percent     239   
93  Both sexes  Heavy drinking            Percent  Percent     239   
94  Both sexes  Heavy drinking            Percent  Percent     239   
95  Both sexes  Heavy drinking            Percent  Percent     239   

   SCALAR_FACTOR  SCALAR_ID      VECTOR  COORDINATE      VALUE STATUS  SYMBOL  \
0          units          0  v110787655  1.1.1.17.1  5782800.0    NaN     NaN   
1          units          0  v110787655  1.1.1.17.1  5770900.0    NaN     NaN   
2          units          0  v110787655  1.1.1.17.1  6015500.0    NaN     NaN   
3          units          0  v110787655  1.1.1.17.1  5946400.0    NaN     NaN   
4          units          0  v110787655  1.1.1.17.1  5802200.0    NaN     NaN   
..           ...        ...         ...         ...        ...    ...     ...   
91         units          0  v110790388  1.6.1.17.4        7.4    NaN     NaN   
92         units          0  v110790388  1.6.1.17.4        7.6    NaN     NaN   
93         units          0  v110790388  1.6.1.17.4        7.4    NaN     NaN   
94         units          0  v110790388  1.6.1.17.4        7.9    NaN     NaN   
95         units          0  v110790388  1.6.1.17.4       10.0    NaN     NaN   

    TERMINATED  DECIMALS  
0          NaN         0  
1          NaN         0  
2          NaN         0  
3          NaN         0  
4          NaN         0  
..         ...       ...  
91         NaN         1  
92         NaN         1  
93         NaN         1  
94         NaN         1  
95         NaN         1  

[96 rows x 18 columns]


alcohol_df['SCALAR_ID'].unique() # this output shows us that the entire column has the same value

array([0])


# Since the SCALAR_ID column doesn't provide information for the dataset, we can remove it
alcohol_df = alcohol_df.drop(columns=['SCALAR_ID'])

# Since the STATUS, SYMBOL, AND TERMINATED columns are completely empty, we can remove them as well
alcohol_df = alcohol_df.drop(columns=['STATUS', 'SYMBOL', 'TERMINATED'])

# Since the UOM_ID (unit of measure ID), DGUID , SCALAR_FACTOR, VECTOR, DECIMALS and COORDINATE
# columns don't provide data necessary to our analysis, we will remove them as well
alcohol_df = alcohol_df.drop(columns=['UOM_ID', 'DGUID', 'SCALAR_FACTOR', 'VECTOR', 'DECIMALS','COORDINATE'])


print(alcohol_df)

    REF_DATE                             GEO                 Age group  \
0       2015  Canada (excluding territories)  Total, 12 years and over   
1       2016  Canada (excluding territories)  Total, 12 years and over   
2       2017  Canada (excluding territories)  Total, 12 years and over   
3       2018  Canada (excluding territories)  Total, 12 years and over   
4       2019  Canada (excluding territories)  Total, 12 years and over   
..       ...                             ...                       ...   
91      2018  Canada (excluding territories)         65 years and over   
92      2019  Canada (excluding territories)         65 years and over   
93      2020  Canada (excluding territories)         65 years and over   
94      2021  Canada (excluding territories)         65 years and over   
95      2022  Canada (excluding territories)         65 years and over   

           Sex      Indicators    Characteristics      UOM      VALUE  
0   Both sexes  Heavy drinking  Number of persons   Number  5782800.0  
1   Both sexes  Heavy drinking  Number of persons   Number  5770900.0  
2   Both sexes  Heavy drinking  Number of persons   Number  6015500.0  
3   Both sexes  Heavy drinking  Number of persons   Number  5946400.0  
4   Both sexes  Heavy drinking  Number of persons   Number  5802200.0  
..         ...             ...                ...      ...        ...  
91  Both sexes  Heavy drinking            Percent  Percent        7.4  
92  Both sexes  Heavy drinking            Percent  Percent        7.6  
93  Both sexes  Heavy drinking            Percent  Percent        7.4  
94  Both sexes  Heavy drinking            Percent  Percent        7.9  
95  Both sexes  Heavy drinking            Percent  Percent       10.0  

[96 rows x 8 columns]


# Upon further examination of the data, we notice that the Characteristics and UOM columns provide similar data
print(alcohol_df['Characteristics'].unique())
print(alcohol_df['UOM'].unique())

# Given that UOM is more concise, we will keep UOM and drop Characteristics
alcohol_df = alcohol_df.drop(columns=['Characteristics'])
print(alcohol_df)

['Number of persons' 'Percent']
['Number' 'Percent']
    REF_DATE                             GEO                 Age group  \
0       2015  Canada (excluding territories)  Total, 12 years and over   
1       2016  Canada (excluding territories)  Total, 12 years and over   
2       2017  Canada (excluding territories)  Total, 12 years and over   
3       2018  Canada (excluding territories)  Total, 12 years and over   
4       2019  Canada (excluding territories)  Total, 12 years and over   
..       ...                             ...                       ...   
91      2018  Canada (excluding territories)         65 years and over   
92      2019  Canada (excluding territories)         65 years and over   
93      2020  Canada (excluding territories)         65 years and over   
94      2021  Canada (excluding territories)         65 years and over   
95      2022  Canada (excluding territories)         65 years and over   

           Sex      Indicators      UOM      VALUE  
0   Both sexes  Heavy drinking   Number  5782800.0  
1   Both sexes  Heavy drinking   Number  5770900.0  
2   Both sexes  Heavy drinking   Number  6015500.0  
3   Both sexes  Heavy drinking   Number  5946400.0  
4   Both sexes  Heavy drinking   Number  5802200.0  
..         ...             ...      ...        ...  
91  Both sexes  Heavy drinking  Percent        7.4  
92  Both sexes  Heavy drinking  Percent        7.6  
93  Both sexes  Heavy drinking  Percent        7.4  
94  Both sexes  Heavy drinking  Percent        7.9  
95  Both sexes  Heavy drinking  Percent       10.0  

[96 rows x 7 columns]


# Finally, we will check for duplicates
duplicated = alcohol_df[alcohol_df.duplicated()]
num_of_duplicates_alc = alcohol_df.duplicated().sum()

print(num_of_duplicates_alc)

0


alcohol_df_percents = alcohol_df.copy()
alcohol_df_percents = alcohol_df_percents[alcohol_df_percents['UOM'] == 'Percent']

# To improve the graph, remove the rows with the age group as 'Total'
alcohol_df_percents = alcohol_df_percents[alcohol_df_percents['Age group'] != 'Total, 12 years and over']


df_pivot = alcohol_df_percents.pivot(index='REF_DATE', columns='Age group', values='VALUE')

for column in df_pivot.columns:
    plt.plot(df_pivot.index, df_pivot[column], marker='o', label=column)

plt.title('Percent of Age Group Reporting Heavy Drinking, Over Years')
plt.xlabel('Year')
plt.ylabel('Percent Reporting Heavy Drinking')
plt.legend(title='Age Group', bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.grid(True)
plt.show()


# To begin, we will create arrays storing the numeric percents associated with each age group
age_12_17 = alcohol_df_percents[alcohol_df_percents['Age group'] == '12 to 17 years']['VALUE']
age_18_34 = alcohol_df_percents[alcohol_df_percents['Age group'] == '18 to 34 years']['VALUE']
age_35_49 = alcohol_df_percents[alcohol_df_percents['Age group'] == '35 to 49 years']['VALUE']
age_50_64 = alcohol_df_percents[alcohol_df_percents['Age group'] == '50 to 64 years']['VALUE']
age_65_above = alcohol_df_percents[alcohol_df_percents['Age group'] == '65 years and over']['VALUE']

results = f_oneway(age_12_17,age_18_34,age_35_49, age_50_64, age_65_above)
print(results.pvalue)

6.293707068743887e-24


smoking_df = pd.read_csv('/content/drive/MyDrive/CSV/smoking_stroke.csv')
print(smoking_df)

         id  gender   age  hypertension  heart_disease ever_married  \
0      9046    Male  67.0             0              1          Yes   
1     51676  Female  61.0             0              0          Yes   
2     31112    Male  80.0             0              1          Yes   
3     60182  Female  49.0             0              0          Yes   
4      1665  Female  79.0             1              0          Yes   
...     ...     ...   ...           ...            ...          ...   
5105  18234  Female  80.0             1              0          Yes   
5106  44873  Female  81.0             0              0          Yes   
5107  19723  Female  35.0             0              0          Yes   
5108  37544    Male  51.0             0              0          Yes   
5109  44679  Female  44.0             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0           Private          Urban             228.69  36.6  formerly smoked   
1     Self-employed          Rural             202.21   NaN     never smoked   
2           Private          Rural             105.92  32.5     never smoked   
3           Private          Urban             171.23  34.4           smokes   
4     Self-employed          Rural             174.12  24.0     never smoked   
...             ...            ...                ...   ...              ...   
5105        Private          Urban              83.75   NaN     never smoked   
5106  Self-employed          Urban             125.20  40.0     never smoked   
5107  Self-employed          Rural              82.99  30.6     never smoked   
5108        Private          Rural             166.29  25.6  formerly smoked   
5109       Govt_job          Urban              85.28  26.2          Unknown   

      stroke  
0          1  
1          1  
2          1  
3          1  
4          1  
...      ...  
5105       0  
5106       0  
5107       0  
5108       0  
5109       0  

[5110 rows x 12 columns]


# 1. Check for exact duplicates -> None Existing
print(smoking_df.shape)
smoking_df.drop_duplicates()
print(smoking_df.shape, "\n")

# 2. Check for missing values -> BMI category is not used for this exploration thus can be ignored
missing_values = smoking_df.isnull().sum()
print(missing_values)

(5110, 12)
(5110, 12) 

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


print(smoking_df)

         id  gender   age  hypertension  heart_disease ever_married  \
0      9046    Male  67.0             0              1          Yes   
1     51676  Female  61.0             0              0          Yes   
2     31112    Male  80.0             0              1          Yes   
3     60182  Female  49.0             0              0          Yes   
4      1665  Female  79.0             1              0          Yes   
...     ...     ...   ...           ...            ...          ...   
5105  18234  Female  80.0             1              0          Yes   
5106  44873  Female  81.0             0              0          Yes   
5107  19723  Female  35.0             0              0          Yes   
5108  37544    Male  51.0             0              0          Yes   
5109  44679  Female  44.0             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0           Private          Urban             228.69  36.6  formerly smoked   
1     Self-employed          Rural             202.21   NaN     never smoked   
2           Private          Rural             105.92  32.5     never smoked   
3           Private          Urban             171.23  34.4           smokes   
4     Self-employed          Rural             174.12  24.0     never smoked   
...             ...            ...                ...   ...              ...   
5105        Private          Urban              83.75   NaN     never smoked   
5106  Self-employed          Urban             125.20  40.0     never smoked   
5107  Self-employed          Rural              82.99  30.6     never smoked   
5108        Private          Rural             166.29  25.6  formerly smoked   
5109       Govt_job          Urban              85.28  26.2          Unknown   

      stroke  
0          1  
1          1  
2          1  
3          1  
4          1  
...      ...  
5105       0  
5106       0  
5107       0  
5108       0  
5109       0  

[5110 rows x 12 columns]


# 0. Explore the columns and rows of our filtered data
print("Num of Rows and Columns : " , smoking_df.shape)

# 1. Explore the columns of the data
print("\nColumns : ", smoking_df.columns)

# 2. Explore the data types of the columns -> age : float64 smoking_status : object (string)
print("\nData types :\n" , smoking_df.dtypes )

# 3. Explore the categories of smoking_status
print("\nCategories of smoking status: ", smoking_df["smoking_status"].unique())

# 4. Check for unknown response for smoking status -> more than 25% refused to respond MNAR
unknown_smoking_status = smoking_df[smoking_df["smoking_status"] == "Unknown"]
unknown_smoking_status
print("\nMissing Num of Rows and Columns :", unknown_smoking_status.shape)

Num of Rows and Columns :  (5110, 12)

Columns :  Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

Data types :
 id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

Categories of smoking status:  ['formerly smoked' 'never smoked' 'smokes' 'Unknown']

Missing Num of Rows and Columns : (1544, 12)


# Convert the smoking status to numeric values
def convert_status (status) :
  if status == "never smoked" :
    return 0
  elif status == "formerly smoked" :
    return 1
  elif status == "smokes" :
    return 2
  elif status == "Unknown" :
    return -1

smoking_df["smoking_status"] = smoking_df["smoking_status"].apply(convert_status)
print(smoking_df)

         id  gender   age  hypertension  heart_disease ever_married  \
0      9046    Male  67.0             0              1          Yes   
1     51676  Female  61.0             0              0          Yes   
2     31112    Male  80.0             0              1          Yes   
3     60182  Female  49.0             0              0          Yes   
4      1665  Female  79.0             1              0          Yes   
...     ...     ...   ...           ...            ...          ...   
5105  18234  Female  80.0             1              0          Yes   
5106  44873  Female  81.0             0              0          Yes   
5107  19723  Female  35.0             0              0          Yes   
5108  37544    Male  51.0             0              0          Yes   
5109  44679  Female  44.0             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi  smoking_status  \
0           Private          Urban             228.69  36.6               1   
1     Self-employed          Rural             202.21   NaN               0   
2           Private          Rural             105.92  32.5               0   
3           Private          Urban             171.23  34.4               2   
4     Self-employed          Rural             174.12  24.0               0   
...             ...            ...                ...   ...             ...   
5105        Private          Urban              83.75   NaN               0   
5106  Self-employed          Urban             125.20  40.0               0   
5107  Self-employed          Rural              82.99  30.6               0   
5108        Private          Rural             166.29  25.6               1   
5109       Govt_job          Urban              85.28  26.2              -1   

      stroke  
0          1  
1          1  
2          1  
3          1  
4          1  
...      ...  
5105       0  
5106       0  
5107       0  
5108       0  
5109       0  

[5110 rows x 12 columns]


# Create and display a contingency table used for our testing
smoking_stroke_table2 = pd.crosstab(smoking_df['stroke'], smoking_df['smoking_status'])
print(smoking_stroke_table2)

# ANOVA Testing to find P-Value
result = f_oneway(smoking_stroke_table2[0],smoking_stroke_table2[1], smoking_stroke_table2[2], smoking_stroke_table2[-1])
print("P-Value: ", result.pvalue)

smoking_status    -1     0    1    2
stroke                              
0               1497  1802  815  747
1                 47    90   70   42
P-Value:  0.9018883799637181


contingency = pd.crosstab(smoking_df['smoking_status'], smoking_df['stroke'])
contingency.plot(kind= "bar")

<Axes: xlabel='smoking_status'>


plt.figure(figsize=(8, 6))
seas.violinplot(x='smoking_status', y='stroke', data=smoking_df, inner='quartile')
plt.ylabel('Stroke')
plt.xlabel('Encoded Smoking Status (-1: Unknown, 0: never smoked, 1: formerly smoked, 2: smokes)')
plt.title(f'Smoking Status and Stroke')
plt.grid(True)
plt.show()


glucose_df = pd.read_csv('/content/drive/MyDrive/CSV/brain_stroke.csv')
print(glucose_df)

      gender   age  hypertension  heart_disease ever_married      work_type  \
0       Male  67.0             0              1          Yes        Private   
1       Male  80.0             0              1          Yes        Private   
2     Female  49.0             0              0          Yes        Private   
3     Female  79.0             1              0          Yes  Self-employed   
4       Male  81.0             0              0          Yes        Private   
...      ...   ...           ...            ...          ...            ...   
4976    Male  41.0             0              0           No        Private   
4977    Male  40.0             0              0          Yes        Private   
4978  Female  45.0             1              0          Yes       Govt_job   
4979    Male  40.0             0              0          Yes        Private   
4980  Female  80.0             1              0          Yes        Private   

     Residence_type  avg_glucose_level   bmi   smoking_status  stroke  
0             Urban             228.69  36.6  formerly smoked       1  
1             Rural             105.92  32.5     never smoked       1  
2             Urban             171.23  34.4           smokes       1  
3             Rural             174.12  24.0     never smoked       1  
4             Urban             186.21  29.0  formerly smoked       1  
...             ...                ...   ...              ...     ...  
4976          Rural              70.15  29.8  formerly smoked       0  
4977          Urban             191.15  31.1           smokes       0  
4978          Rural              95.02  31.8           smokes       0  
4979          Rural              83.94  30.0           smokes       0  
4980          Urban              83.75  29.1     never smoked       0  

[4981 rows x 11 columns]


# Calculate the Pearson correlation coefficient (r) between "avg_glucose_level" and "stroke"
correlation_coefficient = glucose_df['avg_glucose_level'].corr(glucose_df['stroke'])

# Create a plot to visualize the relationship between "avg_glucose_level" and "stroke"
import seaborn as seas
plt.figure(figsize=(8, 6))
seas.violinplot(x='stroke', y='avg_glucose_level', data=glucose_df, inner='quartile')
#plt.scatter(glucose_df['avg_glucose_level'], glucose_df['stroke'], alpha=0.5)
plt.ylabel('Average Glucose Level')
plt.xlabel('Stroke Status')
plt.title(f'Average Glucose Level vs Stroke (Correlation Coefficient: {correlation_coefficient:.2f})')
plt.grid(True)
plt.show()

print("Pearson Correlation Coefficient (r):" , correlation_coefficient)

Pearson Correlation Coefficient (r): 0.13322732663313727


stroke_df = pd.read_csv('/content/drive/MyDrive/CSV/stroke_prediction_dataset.csv')
print(stroke_df)

       Patient ID       Patient Name  Age  Gender  Hypertension  \
0           18153    Mamooty Khurana   56    Male             0   
1           62749  Kaira Subramaniam   80    Male             0   
2           32145      Dhanush Balan   26    Male             1   
3            6154        Ivana Baral   73    Male             0   
4           48973  Darshit Jayaraman   51    Male             1   
...           ...                ...  ...     ...           ...   
14995       13981          Keya Iyer   88  Female             1   
14996       87707       Anahita Virk   47  Female             0   
14997       33174         Ivana Kaur   35    Male             0   
14998       22343        Anvi Mannan   73    Male             0   
14999       11066      Gokul Trivedi   64  Female             0   

       Heart Disease Marital Status       Work Type Residence Type  \
0                  1        Married   Self-employed          Rural   
1                  0         Single   Self-employed          Urban   
2                  1        Married    Never Worked          Rural   
3                  0        Married    Never Worked          Urban   
4                  1       Divorced   Self-employed          Urban   
...              ...            ...             ...            ...   
14995              1       Divorced   Self-employed          Urban   
14996              0        Married         Private          Urban   
14997              0        Married  Government Job          Rural   
14998              0         Single   Self-employed          Urban   
14999              0         Single    Never Worked          Urban   

       Average Glucose Level  ...    Alcohol Intake Physical Activity  \
0                     130.91  ...    Social Drinker          Moderate   
1                     183.73  ...             Never               Low   
2                     189.00  ...            Rarely              High   
3                     185.29  ...  Frequent Drinker          Moderate   
4                     177.34  ...            Rarely               Low   
...                      ...  ...               ...               ...   
14995                 160.22  ...    Social Drinker              High   
14996                 107.58  ...             Never               Low   
14997                 134.90  ...            Rarely              High   
14998                 169.42  ...             Never              High   
14999                 186.88  ...            Rarely          Moderate   

      Stroke History Family History of Stroke  Dietary Habits Stress Levels  \
0                  0                      Yes           Vegan          3.48   
1                  0                       No           Paleo          1.73   
2                  0                      Yes           Paleo          7.31   
3                  0                       No           Paleo          5.35   
4                  0                      Yes     Pescatarian          6.84   
...              ...                      ...             ...           ...   
14995              0                       No           Paleo          1.12   
14996              1                       No     Gluten-Free          1.47   
14997              1                       No           Paleo          0.51   
14998              0                      Yes           Paleo          1.53   
14999              0                       No           Vegan          4.57   

      Blood Pressure Levels  Cholesterol Levels  \
0                   140/108   HDL: 68, LDL: 133   
1                    146/91    HDL: 63, LDL: 70   
2                    154/97    HDL: 59, LDL: 95   
3                    174/81   HDL: 70, LDL: 137   
4                    121/95    HDL: 65, LDL: 68   
...                     ...                 ...   
14995                171/92   HDL: 44, LDL: 153   
14996                155/71   HDL: 35, LDL: 183   
14997               121/110   HDL: 57, LDL: 159   
14998                157/74    HDL: 79, LDL: 91   
14999                133/81   HDL: 78, LDL: 179   

                                                Symptoms  Diagnosis  
0                          Difficulty Speaking, Headache     Stroke  
1        Loss of Balance, Headache, Dizziness, Confusion     Stroke  
2                                    Seizures, Dizziness     Stroke  
3      Seizures, Blurred Vision, Severe Fatigue, Head...  No Stroke  
4                                    Difficulty Speaking     Stroke  
...                                                  ...        ...  
14995                                                NaN  No Stroke  
14996                                Difficulty Speaking  No Stroke  
14997      Difficulty Speaking, Severe Fatigue, Headache     Stroke  
14998  Severe Fatigue, Numbness, Confusion, Dizziness...  No Stroke  
14999                                           Headache     Stroke  

[15000 rows x 22 columns]


young_adults = stroke_df[(stroke_df['Age']>=18)&(stroke_df['Age']<=30)]
print(young_adults)

       Patient ID     Patient Name  Age  Gender  Hypertension  Heart Disease  \
2           32145    Dhanush Balan   26    Male             1              1   
12          66924     Ahana  Lalla   30  Female             0              1   
19          23954     Taran Khatri   25    Male             0              0   
25          36975      Jhanvi Brar   24  Female             0              0   
37          94512       Anvi Salvi   23  Female             0              0   
...           ...              ...  ...     ...           ...            ...   
14972       11839    Chirag Kurian   30    Male             0              1   
14974       30150  Alisha Banerjee   20  Female             0              0   
14981       12323        Pari Ravi   25    Male             0              0   
14983       40381        Sana Goel   18  Female             0              0   
14991       90658      Samaira Raj   26    Male             0              1   

      Marital Status       Work Type Residence Type  Average Glucose Level  \
2            Married    Never Worked          Rural                 189.00   
12          Divorced  Government Job          Urban                 163.15   
19           Married         Private          Urban                  71.38   
25           Married   Self-employed          Urban                  79.89   
37            Single  Government Job          Rural                 164.72   
...              ...             ...            ...                    ...   
14972        Married   Self-employed          Rural                 126.94   
14974       Divorced    Never Worked          Rural                 101.36   
14981         Single         Private          Rural                  77.64   
14983         Single   Self-employed          Urban                  68.26   
14991        Married   Self-employed          Rural                 145.05   

       ...    Alcohol Intake Physical Activity Stroke History  \
2      ...            Rarely              High              0   
12     ...  Frequent Drinker          Moderate              0   
19     ...            Rarely          Moderate              0   
25     ...    Social Drinker              High              1   
37     ...    Social Drinker               Low              1   
...    ...               ...               ...            ...   
14972  ...            Rarely          Moderate              0   
14974  ...             Never              High              0   
14981  ...  Frequent Drinker               Low              0   
14983  ...    Social Drinker          Moderate              1   
14991  ...    Social Drinker               Low              1   

      Family History of Stroke  Dietary Habits Stress Levels  \
2                          Yes           Paleo          7.31   
12                         Yes  Non-Vegetarian          9.19   
19                         Yes     Gluten-Free          0.46   
25                          No      Vegetarian          6.48   
37                         Yes     Gluten-Free          7.86   
...                        ...             ...           ...   
14972                       No     Pescatarian          9.51   
14974                      Yes     Pescatarian          2.26   
14981                      Yes           Paleo          2.69   
14983                       No      Vegetarian          6.79   
14991                       No     Pescatarian          0.71   

      Blood Pressure Levels  Cholesterol Levels  \
2                    154/97    HDL: 59, LDL: 95   
12                   114/67    HDL: 80, LDL: 83   
19                   170/64   HDL: 72, LDL: 174   
25                   151/65   HDL: 73, LDL: 111   
37                   148/74    HDL: 30, LDL: 62   
...                     ...                 ...   
14972                113/65   HDL: 55, LDL: 179   
14974                159/94    HDL: 42, LDL: 99   
14981                135/66   HDL: 58, LDL: 161   
14983                136/66   HDL: 59, LDL: 172   
14991               180/110    HDL: 33, LDL: 99   

                                                Symptoms  Diagnosis  
2                                    Seizures, Dizziness     Stroke  
12                             Loss of Balance, Numbness     Stroke  
19                                              Seizures     Stroke  
25     Numbness, Loss of Balance, Numbness, Blurred V...     Stroke  
37                    Blurred Vision, Seizures, Weakness     Stroke  
...                                                  ...        ...  
14972                                                NaN  No Stroke  
14974                           Seizures, Severe Fatigue     Stroke  
14981  Blurred Vision, Headache, Severe Fatigue, Loss...     Stroke  
14983  Severe Fatigue, Severe Fatigue, Headache, Seiz...     Stroke  
14991                               Confusion, Confusion  No Stroke  

[2662 rows x 22 columns]


duplicate_rows = young_adults[young_adults.duplicated()]
num_duplicates = young_adults.duplicated().sum()

print(duplicate_rows)

Empty DataFrame
Columns: [Patient ID, Patient Name, Age, Gender, Hypertension, Heart Disease, Marital Status, Work Type, Residence Type, Average Glucose Level, Body Mass Index (BMI), Smoking Status, Alcohol Intake, Physical Activity, Stroke History, Family History of Stroke, Dietary Habits, Stress Levels, Blood Pressure Levels, Cholesterol Levels, Symptoms, Diagnosis]
Index: []

[0 rows x 22 columns]


# Function to calculate Z-scores
def z_score(df, threshold=3.5):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    z_scores = np.abs((df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std())
    outliers = (z_scores > threshold).any(axis=1)
    return outliers

# Detect outliers
outliers = z_score(young_adults)

# Show outliers
print("Outliers detected:")
print(young_adults[outliers])

Outliers detected:
Empty DataFrame
Columns: [Patient ID, Patient Name, Age, Gender, Hypertension, Heart Disease, Marital Status, Work Type, Residence Type, Average Glucose Level, Body Mass Index (BMI), Smoking Status, Alcohol Intake, Physical Activity, Stroke History, Family History of Stroke, Dietary Habits, Stress Levels, Blood Pressure Levels, Cholesterol Levels, Symptoms, Diagnosis]
Index: []

[0 rows x 22 columns]


conditions = [
    (young_adults['Stress Levels'] >= 0.00) & (young_adults['Stress Levels'] <= 1.00),
    (young_adults['Stress Levels'] >= 1.01) & (young_adults['Stress Levels'] <= 2.00),
    (young_adults['Stress Levels'] >= 2.01) & (young_adults['Stress Levels'] <= 3.00),
    (young_adults['Stress Levels'] >= 3.01) & (young_adults['Stress Levels'] <= 4.00),
    (young_adults['Stress Levels'] >= 4.01) & (young_adults['Stress Levels'] <= 5.00),
    (young_adults['Stress Levels'] >= 5.01) & (young_adults['Stress Levels'] <= 6.00),
    (young_adults['Stress Levels'] >= 6.01) & (young_adults['Stress Levels'] <= 7.00),
    (young_adults['Stress Levels'] >= 7.01) & (young_adults['Stress Levels'] <= 8.00),
    (young_adults['Stress Levels'] >= 8.01) & (young_adults['Stress Levels'] <= 9.00),
    (young_adults['Stress Levels'] >= 9.01) & (young_adults['Stress Levels'] <= 10.00)
]
# create a list of the values we want to assign for each condition
values = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

# create a new column and use np.select to assign values to it using our lists as arguments
young_adults['New Stress Levels'] = np.select(conditions, values)
print(young_adults)

       Patient ID     Patient Name  Age  Gender  Hypertension  Heart Disease  \
2           32145    Dhanush Balan   26    Male             1              1   
12          66924     Ahana  Lalla   30  Female             0              1   
19          23954     Taran Khatri   25    Male             0              0   
25          36975      Jhanvi Brar   24  Female             0              0   
37          94512       Anvi Salvi   23  Female             0              0   
...           ...              ...  ...     ...           ...            ...   
14972       11839    Chirag Kurian   30    Male             0              1   
14974       30150  Alisha Banerjee   20  Female             0              0   
14981       12323        Pari Ravi   25    Male             0              0   
14983       40381        Sana Goel   18  Female             0              0   
14991       90658      Samaira Raj   26    Male             0              1   

      Marital Status       Work Type Residence Type  Average Glucose Level  \
2            Married    Never Worked          Rural                 189.00   
12          Divorced  Government Job          Urban                 163.15   
19           Married         Private          Urban                  71.38   
25           Married   Self-employed          Urban                  79.89   
37            Single  Government Job          Rural                 164.72   
...              ...             ...            ...                    ...   
14972        Married   Self-employed          Rural                 126.94   
14974       Divorced    Never Worked          Rural                 101.36   
14981         Single         Private          Rural                  77.64   
14983         Single   Self-employed          Urban                  68.26   
14991        Married   Self-employed          Rural                 145.05   

       ...  Physical Activity Stroke History Family History of Stroke  \
2      ...               High              0                      Yes   
12     ...           Moderate              0                      Yes   
19     ...           Moderate              0                      Yes   
25     ...               High              1                       No   
37     ...                Low              1                      Yes   
...    ...                ...            ...                      ...   
14972  ...           Moderate              0                       No   
14974  ...               High              0                      Yes   
14981  ...                Low              0                      Yes   
14983  ...           Moderate              1                       No   
14991  ...                Low              1                       No   

       Dietary Habits  Stress Levels Blood Pressure Levels Cholesterol Levels  \
2               Paleo           7.31                154/97   HDL: 59, LDL: 95   
12     Non-Vegetarian           9.19                114/67   HDL: 80, LDL: 83   
19        Gluten-Free           0.46                170/64  HDL: 72, LDL: 174   
25         Vegetarian           6.48                151/65  HDL: 73, LDL: 111   
37        Gluten-Free           7.86                148/74   HDL: 30, LDL: 62   
...               ...            ...                   ...                ...   
14972     Pescatarian           9.51                113/65  HDL: 55, LDL: 179   
14974     Pescatarian           2.26                159/94   HDL: 42, LDL: 99   
14981           Paleo           2.69                135/66  HDL: 58, LDL: 161   
14983      Vegetarian           6.79                136/66  HDL: 59, LDL: 172   
14991     Pescatarian           0.71               180/110   HDL: 33, LDL: 99   

                                                Symptoms  Diagnosis  \
2                                    Seizures, Dizziness     Stroke   
12                             Loss of Balance, Numbness     Stroke   
19                                              Seizures     Stroke   
25     Numbness, Loss of Balance, Numbness, Blurred V...     Stroke   
37                    Blurred Vision, Seizures, Weakness     Stroke   
...                                                  ...        ...   
14972                                                NaN  No Stroke   
14974                           Seizures, Severe Fatigue     Stroke   
14981  Blurred Vision, Headache, Severe Fatigue, Loss...     Stroke   
14983  Severe Fatigue, Severe Fatigue, Headache, Seiz...     Stroke   
14991                               Confusion, Confusion  No Stroke   

      New Stress Levels  
2                   7-8  
12                 9-10  
19                  0-1  
25                  6-7  
37                  7-8  
...                 ...  
14972              9-10  
14974               2-3  
14981               2-3  
14983               6-7  
14991               0-1  

[2662 rows x 23 columns]

<ipython-input-1683-f09478bf7528>:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  young_adults['New Stress Levels'] = np.select(conditions, values)


stress_table = pd.crosstab(young_adults['New Stress Levels'], young_adults['Diagnosis'])
print(stress_table)

Diagnosis          No Stroke  Stroke
New Stress Levels                   
0-1                      148     131
1-2                      111     155
2-3                      130     135
3-4                      126     124
4-5                      139     112
5-6                      143     138
6-7                      136     127
7-8                      153     148
8-9                      117     141
9-10                     129     119


stress_table.plot(kind='bar', colormap='Paired')
plt.ylabel('Count')

Text(0, 0.5, 'Count')


chi2, p_value, dof, expected = chi2_contingency(stress_table)
print("P-Value:", p_value)

P-Value: 0.10741461278569753


plt.figure(figsize=(8, 6))
seas.violinplot(x='Diagnosis', y='New Stress Levels', data=young_adults, inner='quartile')
plt.ylabel('New Stress Levels')
plt.xlabel('Stroke Diagnosis')
plt.title(f'Stress Levels vs Stroke (Contingency Table)')
plt.grid(True)
plt.show()


alcohol_level_table = pd.crosstab(young_adults['Alcohol Intake'], young_adults['Diagnosis'])
print(alcohol_level_table)

Diagnosis         No Stroke  Stroke
Alcohol Intake                     
Frequent Drinker        338     324
Never                   336     296
Rarely                  328     366
Social Drinker          330     344


alcohol_level_table.plot(kind='bar')
plt.title('Alcohol Intake vs Stroke Diagnosis')
plt.legend(title='Diagnosis', bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.ylabel('Count')

Text(0, 0.5, 'Count')


# Conduct the chi2 test on the contingency table
chi2, p_value, dof, expected = chi2_contingency(alcohol_level_table)
print("P-Value for Alcohol Consumption vs Diagnosis:", p_value)

P-Value for Alcohol Consumption vs Diagnosis: 0.15787917256302525


# Clean the columns for accurate result
young_adults["Smoking Status"].str.strip()
young_adults["Diagnosis"].str.strip()

# Contingency Table
smoking_stroke_table = pd.crosstab(young_adults["Smoking Status"],young_adults["Diagnosis"])
print(smoking_stroke_table)

Diagnosis         No Stroke  Stroke
Smoking Status                     
Currently Smokes        449     466
Formerly Smoked         415     408
Non-smoker              468     456


# Data Comparison - Bar Graph (adequate for comparison between discrete values)
smoking_stroke_table.plot(kind = "bar")

<Axes: xlabel='Smoking Status'>


# P-Value
result = chi2_contingency(smoking_stroke_table)
print(result.pvalue)

0.7673106445186333


plt.figure(figsize=(8, 6))
seas.violinplot(x='Diagnosis', y='Smoking Status', data=young_adults, inner='quartile')
plt.ylabel('Smoking Status')
plt.xlabel('Stroke Diagnosis')
plt.title(f'Smoking Status impacting Stroke Diagnosis')
plt.grid(True)
plt.show()


print(young_adults)

       Patient ID     Patient Name  Age  Gender  Hypertension  Heart Disease  \
2           32145    Dhanush Balan   26    Male             1              1   
12          66924     Ahana  Lalla   30  Female             0              1   
19          23954     Taran Khatri   25    Male             0              0   
25          36975      Jhanvi Brar   24  Female             0              0   
37          94512       Anvi Salvi   23  Female             0              0   
...           ...              ...  ...     ...           ...            ...   
14972       11839    Chirag Kurian   30    Male             0              1   
14974       30150  Alisha Banerjee   20  Female             0              0   
14981       12323        Pari Ravi   25    Male             0              0   
14983       40381        Sana Goel   18  Female             0              0   
14991       90658      Samaira Raj   26    Male             0              1   

      Marital Status       Work Type Residence Type  Average Glucose Level  \
2            Married    Never Worked          Rural                 189.00   
12          Divorced  Government Job          Urban                 163.15   
19           Married         Private          Urban                  71.38   
25           Married   Self-employed          Urban                  79.89   
37            Single  Government Job          Rural                 164.72   
...              ...             ...            ...                    ...   
14972        Married   Self-employed          Rural                 126.94   
14974       Divorced    Never Worked          Rural                 101.36   
14981         Single         Private          Rural                  77.64   
14983         Single   Self-employed          Urban                  68.26   
14991        Married   Self-employed          Rural                 145.05   

       ...  Physical Activity Stroke History Family History of Stroke  \
2      ...               High              0                      Yes   
12     ...           Moderate              0                      Yes   
19     ...           Moderate              0                      Yes   
25     ...               High              1                       No   
37     ...                Low              1                      Yes   
...    ...                ...            ...                      ...   
14972  ...           Moderate              0                       No   
14974  ...               High              0                      Yes   
14981  ...                Low              0                      Yes   
14983  ...           Moderate              1                       No   
14991  ...                Low              1                       No   

       Dietary Habits  Stress Levels Blood Pressure Levels Cholesterol Levels  \
2               Paleo           7.31                154/97   HDL: 59, LDL: 95   
12     Non-Vegetarian           9.19                114/67   HDL: 80, LDL: 83   
19        Gluten-Free           0.46                170/64  HDL: 72, LDL: 174   
25         Vegetarian           6.48                151/65  HDL: 73, LDL: 111   
37        Gluten-Free           7.86                148/74   HDL: 30, LDL: 62   
...               ...            ...                   ...                ...   
14972     Pescatarian           9.51                113/65  HDL: 55, LDL: 179   
14974     Pescatarian           2.26                159/94   HDL: 42, LDL: 99   
14981           Paleo           2.69                135/66  HDL: 58, LDL: 161   
14983      Vegetarian           6.79                136/66  HDL: 59, LDL: 172   
14991     Pescatarian           0.71               180/110   HDL: 33, LDL: 99   

                                                Symptoms  Diagnosis  \
2                                    Seizures, Dizziness     Stroke   
12                             Loss of Balance, Numbness     Stroke   
19                                              Seizures     Stroke   
25     Numbness, Loss of Balance, Numbness, Blurred V...     Stroke   
37                    Blurred Vision, Seizures, Weakness     Stroke   
...                                                  ...        ...   
14972                                                NaN  No Stroke   
14974                           Seizures, Severe Fatigue     Stroke   
14981  Blurred Vision, Headache, Severe Fatigue, Loss...     Stroke   
14983  Severe Fatigue, Severe Fatigue, Headache, Seiz...     Stroke   
14991                               Confusion, Confusion  No Stroke   

      New Stress Levels  
2                   7-8  
12                 9-10  
19                  0-1  
25                  6-7  
37                  7-8  
...                 ...  
14972              9-10  
14974               2-3  
14981               2-3  
14983               6-7  
14991               0-1  

[2662 rows x 23 columns]


def categorize_glucose_level(agl):
    if agl < 117.0:
        return 'Normal'
    elif agl < 137.0:
        return 'Pre-diabetic'
    else:
        return 'Diabetic'

young_adults.loc[:, 'Category'] = young_adults['Average Glucose Level'].apply(categorize_glucose_level)
print(young_adults)

       Patient ID     Patient Name  Age  Gender  Hypertension  Heart Disease  \
2           32145    Dhanush Balan   26    Male             1              1   
12          66924     Ahana  Lalla   30  Female             0              1   
19          23954     Taran Khatri   25    Male             0              0   
25          36975      Jhanvi Brar   24  Female             0              0   
37          94512       Anvi Salvi   23  Female             0              0   
...           ...              ...  ...     ...           ...            ...   
14972       11839    Chirag Kurian   30    Male             0              1   
14974       30150  Alisha Banerjee   20  Female             0              0   
14981       12323        Pari Ravi   25    Male             0              0   
14983       40381        Sana Goel   18  Female             0              0   
14991       90658      Samaira Raj   26    Male             0              1   

      Marital Status       Work Type Residence Type  Average Glucose Level  \
2            Married    Never Worked          Rural                 189.00   
12          Divorced  Government Job          Urban                 163.15   
19           Married         Private          Urban                  71.38   
25           Married   Self-employed          Urban                  79.89   
37            Single  Government Job          Rural                 164.72   
...              ...             ...            ...                    ...   
14972        Married   Self-employed          Rural                 126.94   
14974       Divorced    Never Worked          Rural                 101.36   
14981         Single         Private          Rural                  77.64   
14983         Single   Self-employed          Urban                  68.26   
14991        Married   Self-employed          Rural                 145.05   

       ...  Stroke History Family History of Stroke  Dietary Habits  \
2      ...               0                      Yes           Paleo   
12     ...               0                      Yes  Non-Vegetarian   
19     ...               0                      Yes     Gluten-Free   
25     ...               1                       No      Vegetarian   
37     ...               1                      Yes     Gluten-Free   
...    ...             ...                      ...             ...   
14972  ...               0                       No     Pescatarian   
14974  ...               0                      Yes     Pescatarian   
14981  ...               0                      Yes           Paleo   
14983  ...               1                       No      Vegetarian   
14991  ...               1                       No     Pescatarian   

      Stress Levels  Blood Pressure Levels Cholesterol Levels  \
2              7.31                 154/97   HDL: 59, LDL: 95   
12             9.19                 114/67   HDL: 80, LDL: 83   
19             0.46                 170/64  HDL: 72, LDL: 174   
25             6.48                 151/65  HDL: 73, LDL: 111   
37             7.86                 148/74   HDL: 30, LDL: 62   
...             ...                    ...                ...   
14972          9.51                 113/65  HDL: 55, LDL: 179   
14974          2.26                 159/94   HDL: 42, LDL: 99   
14981          2.69                 135/66  HDL: 58, LDL: 161   
14983          6.79                 136/66  HDL: 59, LDL: 172   
14991          0.71                180/110   HDL: 33, LDL: 99   

                                                Symptoms  Diagnosis  \
2                                    Seizures, Dizziness     Stroke   
12                             Loss of Balance, Numbness     Stroke   
19                                              Seizures     Stroke   
25     Numbness, Loss of Balance, Numbness, Blurred V...     Stroke   
37                    Blurred Vision, Seizures, Weakness     Stroke   
...                                                  ...        ...   
14972                                                NaN  No Stroke   
14974                           Seizures, Severe Fatigue     Stroke   
14981  Blurred Vision, Headache, Severe Fatigue, Loss...     Stroke   
14983  Severe Fatigue, Severe Fatigue, Headache, Seiz...     Stroke   
14991                               Confusion, Confusion  No Stroke   

      New Stress Levels      Category  
2                   7-8      Diabetic  
12                 9-10      Diabetic  
19                  0-1        Normal  
25                  6-7        Normal  
37                  7-8      Diabetic  
...                 ...           ...  
14972              9-10  Pre-diabetic  
14974               2-3        Normal  
14981               2-3        Normal  
14983               6-7        Normal  
14991               0-1      Diabetic  

[2662 rows x 24 columns]

<ipython-input-1696-ad58dde6d0c3>:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  young_adults.loc[:, 'Category'] = young_adults['Average Glucose Level'].apply(categorize_glucose_level)


cont = pd.crosstab(young_adults['Category'], young_adults['Diagnosis'])
print(cont)

Diagnosis     No Stroke  Stroke
Category                       
Diabetic            592     616
Normal              541     527
Pre-diabetic        199     187


cont.plot(kind = 'bar', title='Average Glucose Level vs Stroke (Contingency Table))', xlabel='Average Glucose Level Category', ylabel='Number of Patients')

<Axes: title={'center': 'Average Glucose Level vs Stroke (Contingency Table))'}, xlabel='Average Glucose Level Category', ylabel='Number of Patients'>


ob = spy.stats.contingency.chi2_contingency(cont)
print(ob.pvalue)

0.5969342118907002


print(stress_table)

Diagnosis          No Stroke  Stroke
New Stress Levels                   
0-1                      148     131
1-2                      111     155
2-3                      130     135
3-4                      126     124
4-5                      139     112
5-6                      143     138
6-7                      136     127
7-8                      153     148
8-9                      117     141
9-10                     129     119


data = {
    "Stress Levels": [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5],
    "Stroke Incidence": [131, 155, 135, 124, 112, 138, 127, 148, 141, 119]
}
stress2_df = pd.DataFrame(data)


X = stress2_df["Stress Levels"].values.reshape(-1, 1)
y = stress2_df["Stroke Incidence"].values

model = LinearRegression()


model.fit(X, y)

LinearRegression()

LinearRegression()


coef = model.coef_[0]
intercept = model.intercept_
r_squared = model.score(X, y)

print("Regression Coefficient (Slope):", coef)
print("\n Intercept:", intercept)
print("\n R-squared:", r_squared)

Regression Coefficient (Slope): -0.6424242424242423

 Intercept: 136.21212121212122

 R-squared: 0.02182595182595204


plt.scatter(stress2_df["Stress Levels"], stress2_df["Stroke Incidence"], color='blue')
plt.plot(stress2_df["Stress Levels"], model.predict(X), color='red')
plt.xlabel('Stress Levels')
plt.ylabel('Stroke Incidence')
plt.title('Stress Levels vs Stroke Incidence')
plt.show()


# Create a new dataframe
alcohol_stroke_ml_data = young_adults[["Alcohol Intake", "Diagnosis"]]

# One-hot encode the alcohol intake status
all_consumption_levels = alcohol_stroke_ml_data['Alcohol Intake'].str.get_dummies(sep=', ')
alcohol_stroke_ml_data = pd.concat([alcohol_stroke_ml_data, all_consumption_levels], axis=1).drop(columns=['Alcohol Intake'])

# Change the Stroke / No Stroke labels to 1 and 0
alcohol_stroke_ml_data.replace({"Stroke":1, "No Stroke":0}, inplace=True)


# Create a decision tree model
alcohol_model = DecisionTreeClassifier()

X_alc = alcohol_stroke_ml_data.drop('Diagnosis', axis=1)
Y_alc = alcohol_stroke_ml_data['Diagnosis']

x_alc_train, x_alc_test, y_alc_train, y_alc_test = train_test_split(X_alc, Y_alc, test_size= 0.2, random_state = 42)

alcohol_model.fit(x_alc_train, y_alc_train)

DecisionTreeClassifier()

DecisionTreeClassifier()


# Evaluate the Performance
predict_al = alcohol_model.predict(x_alc_test)


accuracy = accuracy_score(y_alc_test, predict_al)

print(f"Accuracy of predictions using DecisionTree: {accuracy}")
print(classification_report(y_alc_test, predict_al))

Accuracy of predictions using DecisionTree: 0.4896810506566604
              precision    recall  f1-score   support

           0       0.49      0.49      0.49       267
           1       0.49      0.49      0.49       266

    accuracy                           0.49       533
   macro avg       0.49      0.49      0.49       533
weighted avg       0.49      0.49      0.49       533


al_matrix = confusion_matrix(y_alc_test, predict_al)
plt.figure(figsize=(8, 6))
seas.heatmap(al_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No Stroke', 'Stroke'], yticklabels=['No Stroke', 'Stroke'])
plt.title('Confusion Matrix of Decision Tree Predictions')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


# Feature : Smoking Status (categorical categories)
print (young_adults['Smoking Status'].unique())
# Label(prediction) : Diagnosis (categorical categories)
print (young_adults['Diagnosis'].unique())

#print(young_adults)
# Create a Data frame for our model : add additional features (not including other factors) to prevent underfitting
smoking_stroke_ml_df = young_adults[["Smoking Status", "Diagnosis"]]


# Convert the categorical values to numeric values
def convert_status2 (status) :
  if status == "Non-smoker" :
    return 0
  elif status == "Formerly Smoked" :
    return 1
  elif status == "Currently Smokes" :
    return 2

def convert_diagnosis2 (status) :
  if status == "No Stroke" :
    return 0
  elif status == "Stroke" :
    return 1

# Display the converted values
smoking_stroke_ml_df.loc[:,"Smoking Status"] = smoking_stroke_ml_df["Smoking Status"].apply(convert_status2)
smoking_stroke_ml_df.loc[:,"Diagnosis"] = smoking_stroke_ml_df["Diagnosis"].apply(convert_diagnosis2).astype(int)


print(smoking_stroke_ml_df)

# Display the unique values of the converted feature
print(smoking_stroke_ml_df['Diagnosis'].unique())
print(smoking_stroke_ml_df['Smoking Status'].unique())

['Formerly Smoked' 'Non-smoker' 'Currently Smokes']
['Stroke' 'No Stroke']
      Smoking Status Diagnosis
2                  1         1
12                 1         1
19                 0         1
25                 2         1
37                 1         1
...              ...       ...
14972              2         0
14974              1         1
14981              2         1
14983              2         1
14991              0         0

[2662 rows x 2 columns]
[1 0]
[1 0 2]


# Split the Data to training and testing data with testing being  0.2
X = smoking_stroke_ml_df[["Smoking Status"]]
y = smoking_stroke_ml_df['Diagnosis'] >= 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


# Create our model
smoking_model = RandomForestClassifier()
# Train the data using RandomForestClassifier -> ensembling
smoking_model.fit(X_train, y_train)

RandomForestClassifier()

RandomForestClassifier()


# Evaluate the Performance
predictions = smoking_model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)

print(f"Accuracy of prediction: {accuracy}")
print(classification_report(y_test, predictions))

print("The accuracy of this model is not overall ideal")

Accuracy of prediction: 0.4521575984990619
              precision    recall  f1-score   support

       False       0.47      0.66      0.55       267
        True       0.42      0.24      0.31       266

    accuracy                           0.45       533
   macro avg       0.44      0.45      0.43       533
weighted avg       0.44      0.45      0.43       533

The accuracy of this model is not overall ideal


# Use the confusion matrix to visualize the positives (true false) and negatives (true false)
smoking_conf_matrix = confusion_matrix(y_test, predictions)

plt.figure(figsize=(12, 6))
plt.xlabel('Prediction')
plt.ylabel('Actual')
plt.title('Confusion Matrix of Smoking Status VS Diagnosis of Stroke')
seas.heatmap(smoking_conf_matrix, annot=True, fmt="d", xticklabels=["No Stroke", "Stroke"], yticklabels=["No Stroke", "Stroke"])
plt.show()


# Use the correlation matrix to comprehend why the model resulted in having poor performance
smoking_corr_matrix = smoking_stroke_ml_df.corr()

plt.figure(figsize = (12,6))
plt.title('Correlation Matrix')
seas.heatmap(smoking_corr_matrix, annot = True, fmt = '.4f', linewidths = 0.5)
plt.show()


print(young_adults)

       Patient ID     Patient Name  Age  Gender  Hypertension  Heart Disease  \
2           32145    Dhanush Balan   26    Male             1              1   
12          66924     Ahana  Lalla   30  Female             0              1   
19          23954     Taran Khatri   25    Male             0              0   
25          36975      Jhanvi Brar   24  Female             0              0   
37          94512       Anvi Salvi   23  Female             0              0   
...           ...              ...  ...     ...           ...            ...   
14972       11839    Chirag Kurian   30    Male             0              1   
14974       30150  Alisha Banerjee   20  Female             0              0   
14981       12323        Pari Ravi   25    Male             0              0   
14983       40381        Sana Goel   18  Female             0              0   
14991       90658      Samaira Raj   26    Male             0              1   

      Marital Status       Work Type Residence Type  Average Glucose Level  \
2            Married    Never Worked          Rural                 189.00   
12          Divorced  Government Job          Urban                 163.15   
19           Married         Private          Urban                  71.38   
25           Married   Self-employed          Urban                  79.89   
37            Single  Government Job          Rural                 164.72   
...              ...             ...            ...                    ...   
14972        Married   Self-employed          Rural                 126.94   
14974       Divorced    Never Worked          Rural                 101.36   
14981         Single         Private          Rural                  77.64   
14983         Single   Self-employed          Urban                  68.26   
14991        Married   Self-employed          Rural                 145.05   

       ...  Stroke History Family History of Stroke  Dietary Habits  \
2      ...               0                      Yes           Paleo   
12     ...               0                      Yes  Non-Vegetarian   
19     ...               0                      Yes     Gluten-Free   
25     ...               1                       No      Vegetarian   
37     ...               1                      Yes     Gluten-Free   
...    ...             ...                      ...             ...   
14972  ...               0                       No     Pescatarian   
14974  ...               0                      Yes     Pescatarian   
14981  ...               0                      Yes           Paleo   
14983  ...               1                       No      Vegetarian   
14991  ...               1                       No     Pescatarian   

      Stress Levels  Blood Pressure Levels Cholesterol Levels  \
2              7.31                 154/97   HDL: 59, LDL: 95   
12             9.19                 114/67   HDL: 80, LDL: 83   
19             0.46                 170/64  HDL: 72, LDL: 174   
25             6.48                 151/65  HDL: 73, LDL: 111   
37             7.86                 148/74   HDL: 30, LDL: 62   
...             ...                    ...                ...   
14972          9.51                 113/65  HDL: 55, LDL: 179   
14974          2.26                 159/94   HDL: 42, LDL: 99   
14981          2.69                 135/66  HDL: 58, LDL: 161   
14983          6.79                 136/66  HDL: 59, LDL: 172   
14991          0.71                180/110   HDL: 33, LDL: 99   

                                                Symptoms  Diagnosis  \
2                                    Seizures, Dizziness     Stroke   
12                             Loss of Balance, Numbness     Stroke   
19                                              Seizures     Stroke   
25     Numbness, Loss of Balance, Numbness, Blurred V...     Stroke   
37                    Blurred Vision, Seizures, Weakness     Stroke   
...                                                  ...        ...   
14972                                                NaN  No Stroke   
14974                           Seizures, Severe Fatigue     Stroke   
14981  Blurred Vision, Headache, Severe Fatigue, Loss...     Stroke   
14983  Severe Fatigue, Severe Fatigue, Headache, Seiz...     Stroke   
14991                               Confusion, Confusion  No Stroke   

      New Stress Levels      Category  
2                   7-8      Diabetic  
12                 9-10      Diabetic  
19                  0-1        Normal  
25                  6-7        Normal  
37                  7-8      Diabetic  
...                 ...           ...  
14972              9-10  Pre-diabetic  
14974               2-3        Normal  
14981               2-3        Normal  
14983               6-7        Normal  
14991               0-1      Diabetic  

[2662 rows x 24 columns]


glucose_ml_df = young_adults[["Age", "Average Glucose Level", "Diagnosis"]]

def stroke_conversion (status) :
  if status == "No Stroke" :
    return 0
  elif status == "Stroke" :
    return 1

glucose_ml_df.loc[:,"Diagnosis"] = glucose_ml_df["Diagnosis"].apply(stroke_conversion).astype(int)

print(glucose_ml_df)

       Age  Average Glucose Level Diagnosis
2       26                 189.00         1
12      30                 163.15         1
19      25                  71.38         1
25      24                  79.89         1
37      23                 164.72         1
...    ...                    ...       ...
14972   30                 126.94         0
14974   20                 101.36         1
14981   25                  77.64         1
14983   18                  68.26         1
14991   26                 145.05         0

[2662 rows x 3 columns]


Xg = glucose_ml_df[['Average Glucose Level', 'Age']]
Yg = glucose_ml_df['Diagnosis'] >= 1

Xg_train, Xg_test, Yg_train, Yg_test = train_test_split(Xg, Yg, test_size= 0.2, random_state = 42)

scaler = StandardScaler()
Xg_train = scaler.fit_transform(Xg_train)
Xg_test = scaler.fit_transform(Xg_test)


glucose_ml_model = LogisticRegression()
glucose_ml_model.fit(Xg_train, Yg_train)

LogisticRegression()

LogisticRegression()


pred_g = glucose_ml_model.predict(Xg_test)
accu_g = accuracy_score(Yg_test, pred_g)

print(classification_report(Yg_test, pred_g))
print(f"Accuracy Score: {accu_g}")

              precision    recall  f1-score   support

       False       0.52      0.52      0.52       267
        True       0.52      0.53      0.52       266

    accuracy                           0.52       533
   macro avg       0.52      0.52      0.52       533
weighted avg       0.52      0.52      0.52       533

Accuracy Score: 0.5215759849906192


glucose_correlation_matrix = glucose_ml_df.corr()

plt.figure(figsize = (12,6))
plt.title('Correlation Matrix for Logisitic Regression Model on Average Glucose Levels vs Stroke Occurrence Based On Age')
seas.heatmap(glucose_correlation_matrix, annot = True, fmt = '.4f', xticklabels=["Age", "Average Glucose Level", "Diagnosis"], yticklabels=["Age", "Average Glucose Level", "Diagnosis"], linewidths = 0.5)
plt.show()


# Begin with the young_adults dataset
# We don't need the patient names or IDs for the analysis, so we can remove those
main_dataset = young_adults.copy()
main_dataset = main_dataset.drop(['Patient Name', 'Patient ID'], axis=1)
# In our earlier analysis, we reformatted the Stress Levels into discrete ranges
# For this, we can use the raw stress levels instead
main_dataset = main_dataset.drop(['New Stress Levels'], axis=1)

# We also do not need the diabetes category, so we can drop this as well
main_dataset = main_dataset.drop(['Category'], axis=1)


# To aid the analysis, we can split Cholesterol Levels into HDL and LDL levels
main_dataset[['Cholesterol Levels HDL', 'Cholesterol Levels LDL']] = main_dataset['Cholesterol Levels'].str.split(',', expand=True)
main_dataset = main_dataset.drop(['Cholesterol Levels'], axis=1)

# To make sure the values are treated as numbers, we can remove the "HDL: " and "LDL: " text
# and convert the columns to numerical values
main_dataset['Cholesterol Levels HDL'] = main_dataset['Cholesterol Levels HDL'].apply(lambda x: int(x[4:]))
main_dataset['Cholesterol Levels LDL'] = main_dataset['Cholesterol Levels LDL'].apply(lambda x: int(x[5:]))

# Next, to treat blood pressure levels as numbers, we can split it into the systolic and diastolic numbers
main_dataset[['Blood Pressure Systolic', 'Blood Pressure Diastolic']] = main_dataset['Blood Pressure Levels'].str.split('/', expand=True)
main_dataset = main_dataset.drop(['Blood Pressure Levels'], axis=1)
main_dataset['Blood Pressure Systolic'] = main_dataset['Blood Pressure Systolic'].apply(lambda x: int(x))
main_dataset['Blood Pressure Diastolic'] = main_dataset['Blood Pressure Diastolic'].apply(lambda x: int(x))


# To consider the impact of individual symptoms, we must one-hot encode the symptoms.
# To do so, we can split the Symptoms column into distinct symptoms, then run an encoder
all_symptoms = main_dataset['Symptoms'].str.get_dummies(sep=', ')
main_dataset = pd.concat([main_dataset, all_symptoms], axis=1).drop(columns=['Symptoms'])

# Finally, to feed our data to the random forest model, we must one-hot encode our remaining categorical variables
# Sex of the patient:
genders = main_dataset['Gender'].str.get_dummies()
main_dataset = pd.concat([main_dataset, genders], axis=1).drop(columns=['Gender'])

# Marital Status
marital_statuses = main_dataset['Marital Status'].str.get_dummies()
main_dataset = pd.concat([main_dataset, marital_statuses], axis=1).drop(columns=['Marital Status'])

# Work Type
work_types = main_dataset['Work Type'].str.get_dummies()
main_dataset = pd.concat([main_dataset, work_types], axis=1).drop(columns=['Work Type'])

# Residence Type
residence_types = main_dataset['Residence Type'].str.get_dummies()
main_dataset = pd.concat([main_dataset, residence_types], axis=1).drop(columns=['Residence Type'])

# Smoking Statuses
smoking_statuses = main_dataset['Smoking Status'].str.get_dummies()
main_dataset = pd.concat([main_dataset, smoking_statuses], axis=1).drop(columns=['Smoking Status'])

# Alcohol Intakes
alcohol_intakes = main_dataset['Alcohol Intake'].str.get_dummies()
main_dataset = pd.concat([main_dataset, alcohol_intakes], axis=1).drop(columns=['Alcohol Intake'])

# Dietary Habits
diets = main_dataset['Dietary Habits'].str.get_dummies()
main_dataset = pd.concat([main_dataset, diets], axis=1).drop(columns=['Dietary Habits'])

# Physical Activity
physical_activity_levels = main_dataset['Physical Activity'].str.get_dummies()
main_dataset = pd.concat([main_dataset, physical_activity_levels], axis=1).drop(columns=['Physical Activity'])
# Change column names to be more specific
main_dataset = main_dataset.rename(columns={"High": "High Physical Activity", "Low": "Low Physical Activity", "Moderate":"Moderate Physical Activity"})

# Family History of Stroke - replace Yes with 1 and No with 0
main_dataset['Family History of Stroke'].replace({'Yes': 1, 'No': 0}, inplace=True)

# Final cleaning to treat numerical values as integers/floats
main_dataset['Age'] = main_dataset['Age'].apply(lambda x: int(x))
main_dataset['Average Glucose Level'] = main_dataset['Average Glucose Level'].apply(lambda x: float(x))
main_dataset['Body Mass Index (BMI)'] = main_dataset['Body Mass Index (BMI)'].apply(lambda x: float(x))
main_dataset['Stress Levels'] = main_dataset['Stress Levels'].apply(lambda x: float(x))

#main_dataset.columns
main_dataset


# Now we can begin preparing the ML model
X_main_data = main_dataset.drop('Diagnosis', axis=1)
Y_main_data = main_dataset['Diagnosis']
x_main_train, x_main_test, y_main_train, y_main_test = train_test_split(X_main_data, Y_main_data, test_size=.2, random_state=4)

# Features of the model -> added interaction features
print(x_main_train.columns)

Index(['Age', 'Hypertension', 'Heart Disease', 'Average Glucose Level',
       'Body Mass Index (BMI)', 'Stroke History', 'Family History of Stroke',
       'Stress Levels', 'Cholesterol Levels HDL', 'Cholesterol Levels LDL',
       'Blood Pressure Systolic', 'Blood Pressure Diastolic', 'Blurred Vision',
       'Confusion', 'Difficulty Speaking', 'Dizziness', 'Headache',
       'Loss of Balance', 'Numbness', 'Seizures', 'Severe Fatigue', 'Weakness',
       'Female', 'Male', 'Divorced', 'Married', 'Single', 'Government Job',
       'Never Worked', 'Private', 'Self-employed', 'Rural', 'Urban',
       'Currently Smokes', 'Formerly Smoked', 'Non-smoker', 'Frequent Drinker',
       'Never', 'Rarely', 'Social Drinker', 'Gluten-Free', 'Keto',
       'Non-Vegetarian', 'Paleo', 'Pescatarian', 'Vegan', 'Vegetarian',
       'High Physical Activity', 'Low Physical Activity',
       'Moderate Physical Activity'],
      dtype='object')


# Standardize the features
scaler = StandardScaler()
x_main_train = scaler.fit_transform(x_main_train)
x_main_test = scaler.fit_transform(x_main_test)

# Models
models = {
    'KNN': KNeighborsClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'LogisticRegression':LogisticRegression(max_iter=1000000),
    'RandomForest': RandomForestClassifier()
}

# Apply K-fold cross validation and evaluate to alleviate overfitting
skf = StratifiedKFold(n_splits= 5, shuffle=True, random_state=42)

for model_name, model in models.items():
  accuracy = cross_val_score(model, x_main_train, y_main_train, cv = skf)

  # Display the mean and standard deviation of the cross validation accuracy
  print(f"{model_name} \nMean: {accuracy.mean()} \nStandard Deviation: {accuracy.std()}\n")

KNN 
Mean: 0.4894194973764153 
Standard Deviation: 0.023114698595755705

DecisionTree 
Mean: 0.5025893399613366 
Standard Deviation: 0.02539913400661669

LogisticRegression 
Mean: 0.5049400718033693 
Standard Deviation: 0.024654173650678273

RandomForest 
Mean: 0.512448494890914 
Standard Deviation: 0.016121716175018355


# Train each model using the training data
for model_name, model in models.items():
  model.fit(x_main_train, y_main_train)

# Evaluate the performance of each model
for model_name, model in models.items():
  predicted = model.predict(x_main_test)
  print(f"Accuracy of {model_name}: {accuracy_score(y_main_test, predicted)}")
  print(
    f"Classification report for classifier {model_name}:\n"
    f"{classification_report(y_main_test, predicted)}\n"
  )

Accuracy of KNN: 0.46904315196998125
Classification report for classifier KNN:
              precision    recall  f1-score   support

   No Stroke       0.47      0.51      0.49       263
      Stroke       0.47      0.43      0.45       270

    accuracy                           0.47       533
   macro avg       0.47      0.47      0.47       533
weighted avg       0.47      0.47      0.47       533


Accuracy of DecisionTree: 0.4727954971857411
Classification report for classifier DecisionTree:
              precision    recall  f1-score   support

   No Stroke       0.46      0.44      0.45       263
      Stroke       0.48      0.51      0.49       270

    accuracy                           0.47       533
   macro avg       0.47      0.47      0.47       533
weighted avg       0.47      0.47      0.47       533


Accuracy of LogisticRegression: 0.46904315196998125
Classification report for classifier LogisticRegression:
              precision    recall  f1-score   support

   No Stroke       0.46      0.50      0.48       263
      Stroke       0.47      0.44      0.45       270

    accuracy                           0.47       533
   macro avg       0.47      0.47      0.47       533
weighted avg       0.47      0.47      0.47       533


Accuracy of RandomForest: 0.5196998123827392
Classification report for classifier RandomForest:
              precision    recall  f1-score   support

   No Stroke       0.51      0.54      0.53       263
      Stroke       0.53      0.50      0.51       270

    accuracy                           0.52       533
   macro avg       0.52      0.52      0.52       533
weighted avg       0.52      0.52      0.52       533


# Visualize the performance using the confusion matrix
maindata_confusion_matrix = confusion_matrix(y_main_test, predicted)

plt.figure(figsize=(12, 6))
plt.xlabel('Prediction')
plt.ylabel('Actual')
plt.title('Confusion Matrix for the prediction of the main dataset on the diagnosis of stroke')
seas.heatmap(maindata_confusion_matrix, annot=True, fmt="d", xticklabels=["No Stroke", "Stroke"], yticklabels=["No Stroke", "Stroke"])
plt.show()

	Age	Hypertension	Heart Disease	Average Glucose Level	Body Mass Index (BMI)	Stroke History	Family History of Stroke	Stress Levels	Diagnosis	Cholesterol Levels HDL	...	Gluten-Free	Keto	Non-Vegetarian	Paleo	Pescatarian	Vegan	Vegetarian	High Physical Activity	Low Physical Activity	Moderate Physical Activity
2	26	1	1	189.00	20.32	0	1	7.31	Stroke	59	...	0	0	0	1	0	0	0	1	0	0
12	30	0	1	163.15	19.36	0	1	9.19	Stroke	80	...	0	0	1	0	0	0	0	0	0	1
19	25	0	0	71.38	39.00	0	1	0.46	Stroke	72	...	1	0	0	0	0	0	0	0	0	1
25	24	0	0	79.89	17.58	1	0	6.48	Stroke	73	...	0	0	0	0	0	0	1	1	0	0
37	23	0	0	164.72	31.56	1	1	7.86	Stroke	30	...	1	0	0	0	0	0	0	0	1	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
14972	30	0	1	126.94	36.08	0	0	9.51	No Stroke	55	...	0	0	0	0	1	0	0	0	0	1
14974	20	0	0	101.36	21.15	0	1	2.26	Stroke	42	...	0	0	0	0	1	0	0	1	0	0
14981	25	0	0	77.64	23.88	0	1	2.69	Stroke	58	...	0	0	0	1	0	0	0	0	1	0
14983	18	0	0	68.26	36.48	1	0	6.79	Stroke	59	...	0	0	0	0	0	0	1	0	0	1
14991	26	0	1	145.05	35.94	1	0	0.71	No Stroke	33	...	0	0	0	0	1	0	0	0	1	0

**How do lifestyle factors such as stress, alcohol consumption, smoking, and unhealthy eating behaviors correlate with the rising stroke incidence in young adults?**¶

**How do lifestyle factors such as stress, alcohol consumption, smoking, and unhealthy eating behaviors correlate with the rising stroke incidence in young adults?**

Summer 2024 Data Science Project¶

Summer 2024 Data Science Project

**Table of Contents:**

**I. Introduction**

**II. Data Curation and Exploratory Data Analysis**

**1. Stress Data:**

**Overall Summary For This Dataset:**

**1A. Data Preprocessing**

**1B. Data Exploration**

**2. Alcohol Consumption Data:**

**Overall Summary For This Dataset:**

**2A. Data Preprocessing**

**2B. Data Exploration**

**3. Smoking Data:**

**Overall Summary For This Dataset:**

**3A. Data Preprocessing**

**3B. Data Exploration**

**4. Unhealthy Eating Behaviors Data:**

**Overall Summary For This Dataset:**

**4A. Data Preprocessing**

**4B. Data Exploration**

**5. Stroke Data:**

**Overall Summary For This Dataset:**

5A. Data Preprocessing</h2>

**5B. Data Exploration**

**5B1. Stress Levels vs Stroke Diagnosis**

**5B2. Alcohol Intake vs Stroke Diagnosis**

**5B3. Smoking Status vs Stroke Diagnosis**

**5B4. Average Glucose Level vs Stroke Diagnosis**

**III. Machine Learning Analysis and Visualization**

**1. Stress Levels vs Stroke Diagnosis**

**2. Alcohol Intake vs Stroke Diagnosis**

**3. Smoking Status vs Stroke Diagnosis**

**4. Average Glucose Level vs Stroke Diagnosis**

**5. Main Dataset Analysis Using KNN, Decision Tree, Logistic Regression and Random Forest**

**IV. Insights and Conclusions**

**1. Stress Levels vs Stroke Diagnosis**

**2. Alcohol Intake vs Stroke Diagnosis**

**3. Smoking Status vs Stroke Diagnosis**

**4. Average Glucose Level vs Stroke Diagnosis**

**5. Main Dataset Analysis Using KNN, Decision Tree, Logistic Regression and Random Forest**

**6. Conclusion and Future Directions**

**V. References**

How do lifestyle factors such as stress, alcohol consumption, smoking, and unhealthy eating behaviors correlate with the rising stroke incidence in young adults?
¶

How do lifestyle factors such as stress, alcohol consumption, smoking, and unhealthy eating behaviors correlate with the rising stroke incidence in young adults?

Summer 2024 Data Science Project
¶

Table of Contents:

I. Introduction

II. Data Curation and Exploratory Data Analysis

1. Stress Data:

Overall Summary For This Dataset:

1A. Data Preprocessing

1B. Data Exploration

2. Alcohol Consumption Data:

Overall Summary For This Dataset:

2A. Data Preprocessing

2B. Data Exploration

3. Smoking Data:

Overall Summary For This Dataset:

3A. Data Preprocessing

3B. Data Exploration

4. Unhealthy Eating Behaviors Data:

Overall Summary For This Dataset:

4A. Data Preprocessing

4B. Data Exploration

5. Stroke Data:

Overall Summary For This Dataset:

5B. Data Exploration

5B1. Stress Levels vs Stroke Diagnosis

5B2. Alcohol Intake vs Stroke Diagnosis

5B3. Smoking Status vs Stroke Diagnosis

5B4. Average Glucose Level vs Stroke Diagnosis

III. Machine Learning Analysis and Visualization

1. Stress Levels vs Stroke Diagnosis

2. Alcohol Intake vs Stroke Diagnosis

3. Smoking Status vs Stroke Diagnosis

4. Average Glucose Level vs Stroke Diagnosis

5. Main Dataset Analysis Using KNN, Decision Tree, Logistic Regression and Random Forest

IV. Insights and Conclusions

1. Stress Levels vs Stroke Diagnosis

2. Alcohol Intake vs Stroke Diagnosis

3. Smoking Status vs Stroke Diagnosis

4. Average Glucose Level vs Stroke Diagnosis

5. Main Dataset Analysis Using KNN, Decision Tree, Logistic Regression and Random Forest

6. Conclusion and Future Directions

V. References