In [32]:
# !pip install pandas numpy scipy openpyxl statsmodels
Import¶
In [11]:
import pandas as pd
import numpy as np
import re
from scipy.stats import chi2_contingency, fisher_exact, ttest_ind
from openpyxl import load_workbook
Load Data¶
In [12]:
file_path = "IABP thesis master sheet edited version.xlsx"
df = pd.read_excel(file_path)
Clean Column Names¶
In [13]:
df.columns = (
df.columns
.str.strip()
.str.replace(r"[^\w]+", "_", regex=True)
.str.replace(r"_+", "_", regex=True)
.str.strip("_")
.str.lower()
)
df = df.loc[:, ~df.columns.duplicated()]
df.columns.tolist()
Out[13]:
['s_no', 'gender', 'age', 'bmi', 'smoking', 'diabetes', 'systemic_hypertension', 'dyslipidemia', 'ccs_angina_grade', 'nyha_grade', 'duration_of_symptoms', 'left_ventricular_ef', 'left_main', 'previous_cardiac_surgery', 'stemi_nstemi_unstable_angina', 'peripheral_vascular_disease', 'rft', 'lft', 'inr', 'platelets', 'on_anti_platelets', 'euro_score', 'surgery_elective_urgency_emergency', 'iabp_insertion_time', 'reinsertion_of_iabp', 'diagnosis', 'type_of_operation', 'number_of_grafts_i_ii_iii_iv', 'type_of_grafts_used_in_cabg', 'indications', 'gcs', 'power_of_all_4_limbs', 'peripheral_pulse_of_lower_limbs', 'echo_new_findings', 'gi_symptoms', 'lft_1', 'rft_1', 'airway_status', 'sheath', 'chest_x_ray_position_of_iabp', 'heparin_infuson_given_according_to_act', 'complications', 'abg', 'lactate_levels', 'vascular_complications', 'renal_dysfunction', 'compartment_syndrome', 'infection', 'visceral_injury', 'neuro_complications', 'balloon_rupture', 'day_of_iabp_removal', 'forced_iabp_removal_due_to_complications', 'number_of_days_in_icu', 'number_of_days_in_in_hospital', 'outcome']
Extract Comments (Data Dictionary)¶
In [14]:
wb = load_workbook(file_path)
ws = wb.active
comments_dict = {}
for row in ws.iter_rows():
for cell in row:
if cell.comment:
col_name = ws.cell(row=1, column=cell.column).value
comments_dict[col_name] = cell.comment.text
Parse Mappings¶
In [15]:
def parse_mapping(comment):
mapping = {}
if not comment:
return mapping
lines = comment.split("\n")
for line in lines:
line = line.strip()
match = re.match(r"(\d+)\s*[-: ]\s*(.+)", line)
if match:
key = int(match.group(1))
value = match.group(2).strip()
mapping[key] = value
return mapping
column_mappings = {}
for col, comment in comments_dict.items():
parsed = parse_mapping(comment)
if len(parsed) > 0:
column_mappings[col.upper()] = parsed
Create Readable Dataset¶
In [16]:
df_readable = df.copy()
for col in df.columns:
original_col = col.upper()
if original_col in column_mappings:
mapping = column_mappings[original_col]
df_readable[col] = df[col].map(mapping).fillna(df[col])
Helper Functions¶
In [17]:
def chi_square_test(col1, col2):
table = pd.crosstab(df[col1], df[col2])
if table.shape == (2,2):
_, p = fisher_exact(table)
else:
_, p, _, _ = chi2_contingency(table)
return p, table
def t_test(col, outcome_col="outcome"):
group1 = df[df[outcome_col] == 0][col].dropna()
group2 = df[df[outcome_col] == 1][col].dropna()
t_stat, p = ttest_ind(group1, group2)
return p
Bronze -> Silver Checks¶
In [23]:
column_mappings
Out[23]:
{'GENDER': {0: 'Female', 1: 'Male'},
'AGE': {0: '<30 Y', 1: '31 to 50', 2: '51 to 60', 3: '>61'},
'BMI': {67: 'y'},
'SMOKING': {0: 'no', 1: 'yes'},
'DIABETES': {0: 'no', 1: 'yes'},
'SYSTEMIC \nHYPERTENSION': {0: 'no', 1: 'yes'},
'DYSLIPIDEMIA': {0: 'no', 1: 'yes'},
'CCS ANGINA\n GRADE': {1: 'class 1',
2: 'class 2',
3: 'class 3',
4: 'class 4'},
'NYHA \n GRADE': {1: 'Grade 1', 2: 'Grade 2', 3: 'Grade 3', 4: 'Grade 4'},
'DURATION OF \nSYMPTOMS': {0: '< 1 day',
1: '1 day to 7 days',
2: '7 days to 1 month',
3: '> 1 month'},
' LEFT MAIN': {0: 'no LM', 1: 'LM +'},
' STEMI / NSTEMI \n/ UNSTABLE ANGINA': {0: 'STEMI',
1: 'NSTEMI',
2: 'Unstable angina',
3: 'NA'},
' PERIPHERAL \nVASCULAR DISEASE ': {0: 'no', 1: 'yes'},
' PLATELETS ': {0: '<50k', 1: 'to to 100k', 2: '>1L'},
' ON ANTI-\nPLATELETS': {0: 'no',
1: 'yes stopped 4 days before surgery',
2: 'continued till surgery'},
' SURGERY ELECTIVE / URGENCY\n / EMERGENCY': {0: 'elective',
1: 'Urgent',
2: 'emergency'},
'REINSERTION \nOF IABP': {0: 'no', 1: 'yes'},
'DIAGNOSIS': {0: 'CAD',
1: 'CAD + MR',
2: 'CAD + AR',
3: 'CAD + MR/AR',
4: 'MR',
5: 'AR',
6: 'MS',
7: 'AS',
8: 'MR/AR/TR',
9: 'CAD + VSR',
10: 'Constritive pericaritis'},
'·NUMBER OF GRAFTS - \nI, II, III, IV ': {0: 'No Graft',
1: 'I',
2: 'II',
3: 'III',
4: 'IV',
5: 'V'},
'TYPE OF GRAFTS USED IN CABG ': {0: 'with only venous grafts',
1: 'with only arterial',
2: 'CABG with both venous and arterial grafts',
3: 'NA'},
'GCS': {0: '13 to 15 / 8 to 10 T',
1: '9 to 12 / 4 to 7T',
2: '3 to 8 / 2 to 3 T'},
' POWER OF \nALL 4 LIMBS': {0: 'normal', 1: 'abnormal', 2: 'NA (SEDATION)'},
'ECHO NEW \nFINDINGS': {0: 'no', 1: 'yes'},
'GI SYMPTOMS': {0: 'no',
1: 'Hematemesis',
2: 'hematochezia',
3: 'Abdominal pain,',
4: 'Abdominal distension'},
'INR': {0: '0.8 TO 1.2', 1: '1.3 to 2.5', 2: '> 2.5'},
'PLATELETS': {0: '<50k', 1: 'to to 100k', 2: '>1L'},
'AIRWAY \nSTATUS': {0: 'extubated', 1: 'intubated', 3: 'tracheostomy'},
'SHEATH': {0: 'With out', 1: 'With'},
' CHEST X-RAY POSITION \nOF IABP ': {0: 'norma', 1: 'abnormal position'},
'HEPARIN INFUSON GIVEN ACCORDING TO\nACT': {0: 'no', 1: 'yes'},
'ABG': {0: 'normal',
1: 'metabolic acidosis',
2: 'metabolic alkalosis',
3: 'respiratory acidosis',
4: 'respiratory alkalosis'},
'LACTATE \nLEVELS': {0: 'normal', 1: 'elivated'},
'INFECTION': {0: 'no', 1: 'yes local', 2: 'yes systems', 3: 'yes sepsis'},
'VISCERAL \nINJURY ': {0: 'no', 1: 'yes'},
'NEURO COMPLICATIONS': {0: 'no',
1: 'Paraplegia',
2: 'Stroke',
3: 'Foot drop'},
'BALLOON RUPTURE ': {0: 'no', 1: 'yes'},
'FORCED IABP REMOVAL \nDUE TO COMPLICATIONS': {0: 'no', 1: 'yes'},
' NUMBER OF \nDAYS IN IN HOSPITAL': {24: 'days'}}
Analysis Section¶
Demographics vs Complications¶
In [18]:
demographics = [
"gender", "age", "bmi", "smoking",
"diabetes", "systemic_hypertension", "dyslipidemia"
]
results = []
for col in demographics:
try:
p, table = chi_square_test(col, "vascular_complications")
results.append([col, "vascular_complications", p])
except:
pass
pd.DataFrame(results, columns=["Variable", "Target", "P_value"])
Out[18]:
| Variable | Target | P_value | |
|---|---|---|---|
| 0 | gender | vascular_complications | 0.505165 |
| 1 | age | vascular_complications | 0.899990 |
| 2 | bmi | vascular_complications | 0.002730 |
| 3 | smoking | vascular_complications | 0.689167 |
| 4 | diabetes | vascular_complications | 0.701812 |
| 5 | systemic_hypertension | vascular_complications | 0.691120 |
| 6 | dyslipidemia | vascular_complications | 0.786045 |
Indications & Surgery vs Outcome¶
In [19]:
cols = ["indications", "type_of_operation"]
results2 = []
for col in cols:
try:
p, table = chi_square_test(col, "outcome")
results2.append([col, "outcome", p])
except:
pass
pd.DataFrame(results2, columns=["Variable", "Target", "P_value"])
Out[19]:
| Variable | Target | P_value | |
|---|---|---|---|
| 0 | indications | outcome | 5.108896e-03 |
| 1 | type_of_operation | outcome | 4.689308e-18 |
Sheath vs Complications¶
In [20]:
p, table = chi_square_test("sheath", "vascular_complications")
print("P-value:", p)
table
P-value: 0.1001158149833359
Out[20]:
| vascular_complications | 0 | 1 | 6 |
|---|---|---|---|
| sheath | |||
| 0 | 65 | 3 | 3 |
| 1 | 43 | 6 | 0 |
Heparin vs Complications¶
In [21]:
p, table = chi_square_test("heparin_infuson_given_according_to_act", "vascular_complications")
print("P-value:", p)
table
P-value: 0.08187145311507667
Out[21]:
| vascular_complications | 0 | 1 | 6 |
|---|---|---|---|
| heparin_infuson_given_according_to_act | |||
| 0 | 43 | 7 | 1 |
| 1 | 65 | 2 | 2 |
Preoperative Data vs Outcome (T-Test)¶
In [24]:
continuous_vars = [
"left_ventricular_ef",
"platelets",
"inr",
"lactate_levels"
]
results3 = []
for col in continuous_vars:
try:
p = t_test(col)
results3.append([col, p])
except:
pass
pd.DataFrame(results3, columns=["Variable", "P_value"])
C:\Users\Asus\AppData\Local\Temp\ipykernel_41596\3643293397.py:16: SmallSampleWarning: One or more sample arguments is too small; all returned values will be NaN. See documentation for sample size requirements. t_stat, p = ttest_ind(group1, group2)
Out[24]:
| Variable | P_value | |
|---|---|---|
| 0 | left_ventricular_ef | NaN |
| 1 | platelets | NaN |
| 2 | inr | NaN |
| 3 | lactate_levels | NaN |
Pre vs Post Renal Failure¶
In [25]:
p, table = chi_square_test("rft", "renal_dysfunction")
print("P-value:", p)
table
P-value: 8.458793985488693e-05
Out[25]:
| renal_dysfunction | 0 | 1 | 2 |
|---|---|---|---|
| rft | |||
| 0 | 78 | 7 | 6 |
| 1 | 12 | 8 | 6 |
| 2 | 0 | 0 | 1 |
| 3 | 1 | 0 | 1 |
Complications Summary¶
In [26]:
complications = [
"vascular_complications",
"renal_dysfunction",
"infection",
"balloon_rupture"
]
for c in complications:
print("\n", c)
print(df_readable[c].value_counts())
vascular_complications vascular_complications 0 108 1 9 6 3 Name: count, dtype: int64 renal_dysfunction renal_dysfunction 0 91 1 15 2 14 Name: count, dtype: int64 infection infection no 110 yes sepsis 8 yes local 2 Name: count, dtype: int64 balloon_rupture balloon_rupture 0 120 Name: count, dtype: int64
In [28]:
final_results = pd.concat([
pd.DataFrame(results, columns=["Variable", "Target", "P_value"]),
pd.DataFrame(results2, columns=["Variable", "Target", "P_value"]),
pd.DataFrame(results3, columns=["Variable", "P_value"])
])
final_results.to_excel("final_analysis_results.xlsx", index=False)
df_readable.to_excel("clean_readable_data.xlsx", index=False)
Rough Area¶
In [31]:
comments_dict
Out[31]:
{'GENDER': 'Author:\n0 - Female\n1 - Male',
'Age': 'Author:\n0: <30 Y\n1 : 31 to 50\n2 : 51 to 60\n3 : >61',
'BMI': 'Author:\n67 y\nmale\nw - 83\nH - na',
'Smoking': 'Author:\n0 - no\n1 - yes',
'Diabetes': 'Author:\n0- no\n1- yes',
'Systemic \nhypertension': 'Author:\n0- no\n1- yes',
'Dyslipidemia': 'Author:\n0- no\n1- yes',
'ccs angina\n grade': 'Author:\n1 - class 1\n2 - class 2\n3 - class 3\n4 - class 4',
'NYHA \n grade': 'Author:\n1 - Grade 1\n2 - Grade 2\n3 - Grade 3\n4 - Grade 4',
'duration of \nsymptoms': 'Author:\n0 - < 1 day\n1 - 1 day to 7 days\n2 - 7 days to 1 month\n3- > 1 month',
'Left ventricular \nEF': 'Author:\n35\n',
' Left Main': 'Author:\n0 - no LM\n1 - LM +',
' Previous cardiac \nsurgery ': 'Author:\nPost CMV',
' STEMI / NSTEMI \n/ Unstable angina': 'Author:\n0 - STEMI\n1- NSTEMI\n2 - Unstable angina\n3- NA',
' Peripheral \nvascular disease ': 'Author:\n0 - no\n1 - yes',
'RFT': 'Author:\ncreat 1.3 \nbun 53',
'LFT': 'Author:\ntotal 2\nindirect 0.4\ndirect 1.6\n',
'\xa0 INR': 'Author:\n1.38',
' Platelets ': 'Author:\n0 - <50k\n1 - to to 100k\n2 - >1L',
' On Anti-\nplatelets': 'Author:\n0 - no\n1 - yes stopped 4 days before surgery\n2 - continued till surgery',
'\xa0 Euro score': 'Author:\n6.06',
' surgery Elective / Urgency\n / Emergency': 'Author:\n0 - elective\n1 - Urgent\n2 - emergency',
'IABP insertion time': 'Author:\nVF\n',
'Reinsertion \nof IABP': 'Author:\n0 -no\n1 - yes',
'Diagnosis': 'Author:\n0- CAD \n1- CAD + MR\n2- CAD + AR\n3- CAD + MR/AR\n 4- MR\n5- AR\n6- MS\n7- AS\n8 - MR/AR/TR\n9 - CAD + VSR\n10 - Constritive pericaritis',
'Type of operation ': 'Author:\nCABG with MVR\n',
'·Number of grafts - \nI, II, III, IV ': 'Author:\n0 - No Graft\n1- I\n2 - II \n3 - III\n4 - IV \n5- V\n',
'type of grafts used in CABG ': 'Author:\n0 - with only venous grafts \n1- with only arterial \ngrafts \n2 - CABG with both venous and arterial grafts\n3 - NA',
'Indications': 'Author:\npre op pulmonary edema',
'GCS': 'Author:\n0 - 13 to 15 / 8 to 10 T\n1 - 9 to 12 / 4 to 7T\n2 - 3 to 8 / 2 to 3 T',
' Power of \nall 4 limbs': 'Author:\n0 - normal\n1 - abnormal\n2- NA (SEDATION)',
'Peripheral pulse \nof lower limbs': 'Author:\nabsent in left leg, foot amputation due to PVD in 2016, \nIABP placed in right FA',
'Echo new \nfindings': 'Author:\n0 - no\n1 - yes\n',
'GI symptoms': 'Author:\n0- no\n1 - Hematemesis \n2- hematochezia \n3 - Abdominal pain, \n4- Abdominal distension\n',
'INR': 'Author:\n0- 0.8 TO 1.2\n1 - 1.3 to 2.5\n2 - > 2.5',
'Platelets': 'Author:\n0 - <50k\n1 - to to 100k\n2 - >1L',
'Airway \nstatus': 'Author:\n0 - extubated\n1 - intubated\n3- tracheostomy\n',
'Sheath': 'Author:\n0 -With out\n1- With',
' Chest x-ray position \nof IABP ': 'Author:\n0 - norma\n1 - abnormal position',
'Heparin infuson given according to\nACT': 'Author:\n0 -no\n1 - yes',
'ABG': 'Author:\n0 - normal\n1 - metabolic acidosis\n2 - metabolic alkalosis\n3 - respiratory acidosis\n4 - respiratory alkalosis\n',
'Lactate \nlevels': 'Author:\n0 - normal\n1 - elivated',
'Vascular \ncomplications': 'Author:\nvascular spasm',
'Renal \ndysfunction': 'Author:\nrabhdo due to compartmnet syndrm',
'Compartment\nsyndrome ': 'Author:\nyes',
'Infection': 'Author:\n0 - no\n1 - yes local\n2 - yes systems\n3 - yes sepsis',
'Visceral \ninjury ': 'Author:\n0 - no\n1 - yes',
'Neuro complications': 'Author:\n0- no\n1 - Paraplegia \n2 - Stroke\n3 - Foot drop\n\n',
'Balloon rupture ': 'Author:\n0 - no\n1 - yes',
'Day of IABP \nremoval': 'Author:\n3rd day of IABP\n',
'Forced IABP removal \ndue to complications': 'Author:\n 0 - no\n1 -yes',
' Number of \ndays in ICU': 'Author:\n23days',
' Number of \ndays in in hospital': 'Author:\n24 days\n',
'Outcome': 'Author:\nsepsis '}
In [ ]: