import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv("data.csv", parse_dates=["date"])

# Convert date column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Convert birth year into age.
latest_year = df['date'].dt.year.max()
df['age'] = latest_year - df['birth_year']
df.drop(columns=['birth_year'], inplace=True)

# Convert the player_key column to string
df['player_key'] = df['player_key'].astype(str)

# Sort by player and date to ensure chronological order
df = df.sort_values(['player_key', 'date'])

df.head(10)


# Look at single player data.
pk = np.random.choice(df['player_key'].unique())
player_data = df[df['player_key'] == pk].sort_values('date')
player_data


print(df.isnull().sum())
# No missing data.

player_key        0
date              0
turnover_sum      0
turnover_num      0
NGR_sum           0
deposit_sum       0
deposit_num       0
withdrawal_sum    0
withdrawal_num    0
login_num         0
age               0
dtype: int64


df.info()
df.describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.99])
# Many columns have long tails. Some with extremely high max values.
# Many users do not withdraw funds, suggesting retained balances.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 416939 entries, 222075 to 235267
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   player_key      416939 non-null  object        
 1   date            416939 non-null  datetime64[ns]
 2   turnover_sum    416939 non-null  float64       
 3   turnover_num    416939 non-null  int64         
 4   NGR_sum         416939 non-null  float64       
 5   deposit_sum     416939 non-null  int64         
 6   deposit_num     416939 non-null  int64         
 7   withdrawal_sum  416939 non-null  int64         
 8   withdrawal_num  416939 non-null  int64         
 9   login_num       416939 non-null  int64         
 10  age             416939 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(7), object(1)
memory usage: 38.2+ MB


# From our table from df.describe, we know that we have long tails and extremely high max values.
# Lets try capping numerical values between the 1st and 99th percentile.

excluded_columns = ['player_key', 'date']
target_columns = df.columns.difference(excluded_columns)

for col in target_columns:
    min_threshold = df[col].quantile(0.01)
    max_threshold = df[col].quantile(0.99)
    df[col] = np.where(df[col] < min_threshold, min_threshold, 
                           np.where(df[col] > max_threshold, max_threshold, df[col]))

df.plot(kind='hist', subplots=True, layout=(4, 4), figsize=(18, 15), bins=50, alpha=0.7)
plt.tight_layout()
plt.show()


# Utils.
def retrieve_random_player(dataframe):
    """Fetches session records for a randomly chosen player"""
    pk = np.random.choice(dataframe['player_key'].unique())
    df = dataframe[dataframe['player_key'] == pk].sort_values(by='date')
    return df

def calculate_and_plot_days_since_last(
    activity_column: str,df: pd.DataFrame 
):
    """Calculates the days since the last activity (where activity > 0).

    Args:
        df: The input DataFrame with 'player_key', 'date', and activity column.
        activity_column: The name of the column indicating activity (e.g.,
                         'login_num', 'turnover_num').
    """

    # Sort by player and date
    df = df.sort_values(by=["player_key", "date"])

    # Filter out rows where activity is greater than 0
    activity_dates = df[df[activity_column] > 0].copy()

    # Calculate the difference between activity dates
    days_since_column = f"days_since_last_{activity_column.replace('_num', '')}"
    activity_dates[days_since_column] = (
      activity_dates.groupby("player_key")["date"].diff().dt.days
    )

    # Remove outliers in the 'days_since_last_activity' column by capping
    # the values at the 1st and 99th percentile
    
    lower_bound = activity_dates[days_since_column].quantile(0.01)
    upper_bound = activity_dates[days_since_column].quantile(0.99)
    activity_dates[days_since_column] = np.clip(
        activity_dates[days_since_column], lower_bound, upper_bound
    )
    
    

    # Plot the distribution of 'days_since_last_activity'
    plt.figure(figsize=(10, 6))
    sns.histplot(activity_dates[days_since_column], bins=40, kde=True)
    # Add a vertical line at the churn window
    plt.axvline(
        x=30,
        color="red",
        linestyle="--",
        label=f"Churn Window (30 days)",
    )
    plot_title = f"Distribution of Days Since Last {activity_column.replace('_num', '').replace('_sum', '').title()}"
    plt.title(plot_title)
    plt.xlabel(f"Days Since Last {activity_column.replace('_num', '').title()}")
    plt.ylabel("Frequency")
    plt.show()

# Sort by player and date
df = df.sort_values(by=["player_key", "date"])


calculate_and_plot_days_since_last("login_num",df)


calculate_and_plot_days_since_last("turnover_sum",df)


calculate_and_plot_days_since_last("deposit_sum",df)


calculate_and_plot_days_since_last("withdrawal_sum",df)


calculate_and_plot_days_since_last("NGR_sum",df)


# Create our label called churn. Which we define as a player who has zero turnover_sum for more than 30 days.
# Filter out rows where activity is greater than 0
turnover_dates = df[df["turnover_sum"] > 0].copy()

# Calculate the difference between activity dates
turnover_dates["days_since_last_bet"] = (
  turnover_dates.groupby("player_key")["date"].diff().dt.days
)
# Fill NaN with zero. New players will not have previous bets.
turnover_dates["days_since_last_bet"] = turnover_dates["days_since_last_bet"].fillna(0)
turnover_dates['churned'] = turnover_dates["days_since_last_bet"] > 30 # Churn window
retrieve_random_player(turnover_dates)[["player_key","date","days_since_last_bet","churned"]]


# Merge  data back to the original dataframe
df = pd.merge(
    df,
    turnover_dates[["player_key", "date", "days_since_last_bet", "churned"]],
    on=["player_key", "date"],
    how="left",
)

# Fill NaN values in 'days_since_last_bet' with 0 and 'churned' with False
df["days_since_last_bet"] = df["days_since_last_bet"].fillna(0)
df["churned"] = df["churned"].fillna(False)
df.head(20)


# The data above shows when we consider a customer "churned" already.
# That is already too late for any possible preemptive CRM action.
# What we want to predict is if the customer will churn in the future 
# or not. So we create the label "will_churn" by lagging "churned".
df = df.copy() 
df["will_churn"] = df.groupby('player_key')['churned'].shift(-1)

# Drop the rows with NaN values in the 'will_churn' column
df = df.dropna(subset=['will_churn'])
retrieve_random_player(df)[["player_key","date","days_since_last_bet","churned", "will_churn"]]


# Average bet size
df["avg_bet_size"] = df["turnover_sum"] / df["turnover_num"]

# Average withdrawal size
df["avg_withdrawal_size"] = df["withdrawal_sum"] / df["withdrawal_num"]

# Average deposit size
df["avg_deposit_size"] = df["deposit_sum"] / df["deposit_num"]

# Turnover/NGR Ratio
df["turnover_to_ngr_ratio"] = df["turnover_sum"] / df["NGR_sum"]

# Rolling differences
df['rolling_login_num_diff'] = df.groupby('player_key')['login_num'].diff()
df['rolling_deposit_num_diff'] = df.groupby('player_key')['deposit_num'].diff()
df['rolling_withdrawal_num_diff'] = df.groupby('player_key')['withdrawal_num'].diff()
df['rolling_NGR_diff'] = df.groupby('player_key')['NGR_sum'].diff()

# Add current year, month, and day as features
df.set_index('date', inplace=True)
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df


df['will_churn'].value_counts(normalize=True)
# The churners class represents small part of the datapoints.

False    0.944826
True     0.055174
Name: will_churn, dtype: float64


turnover_features = ['turnover_sum', 'turnover_num', 'avg_bet_size']
deposit_features = ['deposit_sum', 'deposit_num', 'avg_deposit_size']
withdrawal_features = ['withdrawal_sum', 'withdrawal_num', 'avg_withdrawal_size']
ngr_features = ['NGR_sum','turnover_to_ngr_ratio']
other_features = ['age', 'year', 'month', 'day', 'login_num']
target = ['will_churn']

non_historical_features= turnover_features + deposit_features + withdrawal_features + ngr_features + other_features + target
df['will_churn'] = df['will_churn'].astype(int)  # Convert to integer (0 or 1)

# Plot the correlation matrix
correlation_matrix = df[non_historical_features].corr()
plt.figure(figsize=(18, 12))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# The correlation between the target variable and the features are small. 
# We could remove average_withdrawal size from our feature list given the high corr with withdrawal_sum.


df['will_churn']

date
2020-07-12    0
2020-07-28    0
2020-08-03    0
2020-08-07    0
2020-08-13    0
             ..
2021-12-02    0
2021-12-11    0
2021-12-21    1
2023-01-21    0
2023-01-24    1
Name: will_churn, Length: 404409, dtype: int64


# Create training and testing set.
split_date = '2022-11-12'
train_data = df[df.index < split_date]
test_data = df[df.index >= split_date]
print(f"Train set%: {len(train_data) / len(df) * 100:.2f}%")
print(f"Train set%: {len(test_data) / len(df) * 100:.2f}%")

Train set%: 69.99%
Train set%: 30.01%


print("Training set:")
print(train_data['will_churn'].value_counts(normalize=True))

print("\nTesting set:")
print(test_data['will_churn'].value_counts(normalize=True))

Training set:
0    0.939584
1    0.060416
Name: will_churn, dtype: float64

Testing set:
0    0.95705
1    0.04295
Name: will_churn, dtype: float64


df.columns

Index(['player_key', 'turnover_sum', 'turnover_num', 'NGR_sum', 'deposit_sum',
       'deposit_num', 'withdrawal_sum', 'withdrawal_num', 'login_num', 'age',
       'days_since_last_bet', 'churned', 'will_churn', 'avg_bet_size',
       'avg_withdrawal_size', 'avg_deposit_size', 'turnover_to_ngr_ratio',
       'rolling_login_num_diff', 'rolling_deposit_num_diff',
       'rolling_withdrawal_num_diff', 'rolling_NGR_diff', 'year', 'month',
       'day'],
      dtype='object')


from sklearn.ensemble import RandomForestClassifier
excluded_cols = ['player_key', 'will_churn', 'churned']

X_train = train_data.drop(columns=excluded_cols).values
y_train = train_data['will_churn'].values.astype(bool)

X_test = test_data.drop(columns=excluded_cols).values
y_test = test_data['will_churn'].values.astype(bool)


from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Handle NaN and inf values.
X_train[np.isinf(X_train)] = np.nan
X_test[np.isinf(X_test)] = np.nan
imputer = SimpleImputer(strategy="median")  # Since we have outliers, we use median.
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Initialize Random Forest with basic parameters
rf_model = RandomForestClassifier(
    n_estimators=400,   # More trees can help stabilize the impact of class imbalance
    max_depth=9,       # Shallower trees generalize better
    min_samples_split=12,  # More data per decision split, helping with generalization
    min_samples_leaf=5,
    class_weight={0: 1, 1: 11}, # forces the model to prioritize churners more
    random_state=42,
    n_jobs=-1
)

# Fit the model
rf_model.fit(X_train_imputed, y_train)

RandomForestClassifier(class_weight={0: 1, 1: 11}, max_depth=9,
                       min_samples_leaf=5, min_samples_split=12,
                       n_estimators=400, n_jobs=-1, random_state=42)

RandomForestClassifier(class_weight={0: 1, 1: 11}, max_depth=9,
                       min_samples_leaf=5, min_samples_split=12,
                       n_estimators=400, n_jobs=-1, random_state=42)


from sklearn.metrics import confusion_matrix, classification_report, average_precision_score

# Make predictions on the test set
y_pred = rf_model.predict(X_test_imputed)
y_pred_proba = rf_model.predict_proba(X_test_imputed)[:, 1]

# 1. Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# 2. Precision, Recall, and F1-Score
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 3. Precision-Recall AUC
pr_auc = average_precision_score(y_test, y_pred_proba)
print("Precision-Recall AUC: {:.3f}".format(pr_auc))

Confusion Matrix:
[[96376 19784]
 [ 2250  2963]]

Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.83      0.90    116160
        True       0.13      0.57      0.21      5213

    accuracy                           0.82    121373
   macro avg       0.55      0.70      0.55    121373
weighted avg       0.94      0.82      0.87    121373

Precision-Recall AUC: 0.145


from sklearn.metrics import precision_recall_curve
# Compute precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)

# Plot the Precision-Recall Curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.', label=f'Random Forest (PR AUC = {pr_auc:.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True)
plt.show()


# Baseline. Always predict 'will_churn' = False.

baseline_preds = np.zeros_like(y_test)
# Evaluate the baseline model
baseline_cm = confusion_matrix(y_test, baseline_preds)
baseline_report = classification_report(y_test, baseline_preds, zero_division=0)

# 1. Confusion Matrix
cm = confusion_matrix(y_test, baseline_preds)
print("Confusion Matrix:")
print(cm)

# 2. Precision, Recall, and F1-Score
print("\nClassification Report:")
print(classification_report(y_test, baseline_preds))

Confusion Matrix:
[[116160      0]
 [  5213      0]]

Classification Report:
              precision    recall  f1-score   support

       False       0.96      1.00      0.98    116160
        True       0.00      0.00      0.00      5213

    accuracy                           0.96    121373
   macro avg       0.48      0.50      0.49    121373
weighted avg       0.92      0.96      0.94    121373

/Users/arvindeleon/projects/engine/venv/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1327: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/arvindeleon/projects/engine/venv/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1327: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/arvindeleon/projects/engine/venv/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1327: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

	player_key	date	turnover_sum	turnover_num	NGR_sum	deposit_sum	deposit_num	login_num	age
222075	-100156789188932904	2020-07-12	7.99	10	6.47	7	1	1	46
222069	-100156789188932904	2020-07-28	10.13	49	-2.33	4	1	1	46
55426	-100156789188932904	2020-08-03	4.36	46	0.24	0	0	1	46
55403	-100156789188932904	2020-08-07	12.06	32	-5.53	0	0	1	46
55378	-100156789188932904	2020-08-13	33.40	189	7.21	0	0	1	46
55394	-100156789188932904	2020-08-16	0.35	4	-0.47	0	0	1	46
55415	-100156789188932904	2020-08-20	10.55	99	5.32	0	0	1	46
55402	-100156789188932904	2020-08-21	0.00	0	0.00	0	0	1	46
55406	-100156789188932904	2020-09-01	0.00	0	0.00	0	0	1	46
222084	-100156789188932904	2020-09-03	12.67	55	8.73	9	1	3	46

	player_key	date	turnover_sum	turnover_num	NGR_sum	deposit_sum	deposit_num	withdrawal_sum	withdrawal_num	login_num	age
195795	-2290450308142927403	2020-12-30	300.24	334	-27.59	45	1	0	0	1	23
195777	-2290450308142927403	2020-12-31	484.99	354	10.35	45	1	89	1	0	23
195794	-2290450308142927403	2021-01-22	1313.36	365	33.97	134	1	90	1	1	23
43484	-2290450308142927403	2021-03-25	0.00	0	0.00	0	0	0	0	1	23
195786	-2290450308142927403	2021-08-21	160.58	286	34.83	44	1	0	0	1	23
43483	-2290450308142927403	2021-08-23	0.05	1	-5.76	0	0	0	0	0	23
195788	-2290450308142927403	2021-10-26	198.40	131	4.43	45	1	0	0	1	23
43489	-2290450308142927403	2021-10-27	86.94	104	38.88	0	0	0	0	0	23
43486	-2290450308142927403	2021-11-06	0.00	0	0.00	0	0	0	0	1	23
195774	-2290450308142927403	2021-12-05	194.38	443	50.79	65	1	0	0	1	23
43488	-2290450308142927403	2021-12-07	0.05	1	-4.03	0	0	0	0	0	23
195789	-2290450308142927403	2022-01-28	42.26	129	21.87	22	1	0	0	1	23
195791	-2290450308142927403	2022-03-04	94.64	176	36.24	42	1	0	0	1	23
195784	-2290450308142927403	2022-04-29	174.50	354	10.74	22	1	0	0	2	23
195793	-2290450308142927403	2022-05-09	104.91	317	34.25	43	1	0	0	1	23
195782	-2290450308142927403	2022-05-10	23.40	54	19.26	21	1	0	0	1	23
195775	-2290450308142927403	2022-05-27	76.84	118	39.19	43	1	0	0	1	23
195790	-2290450308142927403	2022-06-07	35.82	93	5.23	9	1	0	0	1	23
195772	-2290450308142927403	2022-06-24	32.00	135	16.12	21	1	0	0	1	23
195785	-2290450308142927403	2022-07-04	33.77	204	13.79	21	1	0	0	1	23
195781	-2290450308142927403	2022-07-09	26.88	110	17.12	21	1	0	0	1	23
195792	-2290450308142927403	2022-08-08	73.57	209	14.02	22	1	0	0	1	23
195778	-2290450308142927403	2022-09-13	41.98	158	15.68	21	1	0	0	1	23
195773	-2290450308142927403	2022-09-18	52.03	253	11.90	21	1	0	0	1	23
195783	-2290450308142927403	2022-09-26	67.53	215	33.71	41	1	0	0	1	23
195776	-2290450308142927403	2022-10-25	33.98	166	14.68	20	1	0	0	1	23
195780	-2290450308142927403	2022-10-29	23.75	86	8.89	41	1	0	0	1	23
43487	-2290450308142927403	2022-10-30	73.54	28	14.59	0	0	0	0	0	23
43490	-2290450308142927403	2022-12-09	30.81	83	11.23	0	0	0	0	1	23
195787	-2290450308142927403	2022-12-31	111.50	136	7.12	40	1	28	1	1	23
195779	-2290450308142927403	2023-01-05	20.06	88	5.18	20	1	12	1	1	23
43485	-2290450308142927403	2023-03-31	-0.16	0	-0.16	0	0	0	0	0	23

	turnover_sum	turnover_num	NGR_sum	deposit_sum	deposit_num	withdrawal_sum	withdrawal_num	login_num	age
count	416939.000000	416939.000000	416939.000000	416939.000000	416939.000000	416939.000000	416939.000000	416939.000000	416939.000000
mean	145.863234	124.166593	6.704987	26.120663	1.112981	19.035372	0.243012	1.528358	36.955936
std	973.176321	238.528250	166.889940	70.417036	1.269387	112.396788	0.527907	1.228827	15.469961
min	-26.820000	0.000000	-62045.020000	0.000000	0.000000	0.000000	0.000000	0.000000	18.000000
1%	0.000000	0.000000	-229.678600	0.000000	0.000000	0.000000	0.000000	0.000000	19.000000
25%	5.500000	4.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	23.000000
50%	27.020000	35.000000	4.420000	9.000000	1.000000	0.000000	0.000000	1.000000	33.000000
75%	100.275000	134.000000	16.300000	24.000000	1.000000	0.000000	0.000000	2.000000	49.000000
99%	1758.454600	1152.000000	212.500000	225.620000	6.000000	342.000000	2.000000	6.000000	75.000000
max	347559.240000	8378.000000	44386.840000	8423.000000	59.000000	20686.000000	24.000000	48.000000	95.000000

	player_key	date	days_since_last_bet	churned
173719	-4117809391698564203	2022-06-26	0.0	False
173703	-4117809391698564203	2022-06-27	1.0	False
173700	-4117809391698564203	2022-06-28	1.0	False
32032	-4117809391698564203	2022-07-03	5.0	False
173696	-4117809391698564203	2022-07-04	1.0	False
173701	-4117809391698564203	2022-07-05	1.0	False
32018	-4117809391698564203	2022-07-07	2.0	False
173692	-4117809391698564203	2022-07-12	5.0	False
173711	-4117809391698564203	2022-07-13	1.0	False
173709	-4117809391698564203	2022-07-14	1.0	False
173716	-4117809391698564203	2022-08-02	19.0	False
173706	-4117809391698564203	2022-08-03	1.0	False
173698	-4117809391698564203	2022-08-05	2.0	False
173715	-4117809391698564203	2022-08-08	3.0	False
173702	-4117809391698564203	2022-08-12	4.0	False
173695	-4117809391698564203	2022-09-02	21.0	False
173704	-4117809391698564203	2022-09-06	4.0	False
173710	-4117809391698564203	2022-09-24	18.0	False
173714	-4117809391698564203	2022-09-30	6.0	False
173697	-4117809391698564203	2022-10-01	1.0	False
173689	-4117809391698564203	2022-10-07	6.0	False
173725	-4117809391698564203	2022-10-14	7.0	False
32026	-4117809391698564203	2022-10-20	6.0	False
173720	-4117809391698564203	2022-10-29	9.0	False
173721	-4117809391698564203	2022-10-30	1.0	False
173723	-4117809391698564203	2022-11-14	15.0	False
173694	-4117809391698564203	2022-11-29	15.0	False
173708	-4117809391698564203	2022-12-10	11.0	False
173705	-4117809391698564203	2022-12-16	6.0	False
173712	-4117809391698564203	2022-12-17	1.0	False
173707	-4117809391698564203	2022-12-18	1.0	False
173699	-4117809391698564203	2023-01-01	14.0	False
173713	-4117809391698564203	2023-01-14	13.0	False
173691	-4117809391698564203	2023-01-18	4.0	False
173693	-4117809391698564203	2023-01-26	8.0	False
173717	-4117809391698564203	2023-01-27	1.0	False
173722	-4117809391698564203	2023-05-30	123.0	True
173724	-4117809391698564203	2023-06-23	24.0	False
173690	-4117809391698564203	2023-06-27	4.0	False
173718	-4117809391698564203	2023-06-28	1.0	False
32004	-4117809391698564203	2023-06-30	2.0	False

	player_key	date	days_since_last_bet	churned	will_churn
324954	5716654208414975603	2022-08-04	0.0	False	False
324955	5716654208414975603	2022-08-25	0.0	False	False
324956	5716654208414975603	2022-08-26	0.0	False	False
324957	5716654208414975603	2022-08-27	0.0	False	False
324958	5716654208414975603	2022-08-28	0.0	False	False
324959	5716654208414975603	2022-08-29	0.0	False	False
324960	5716654208414975603	2022-08-30	0.0	False	False
324961	5716654208414975603	2022-08-31	0.0	False	False
324962	5716654208414975603	2022-09-01	28.0	False	False
324963	5716654208414975603	2022-09-23	0.0	False	False
324964	5716654208414975603	2022-09-26	0.0	False	False
324965	5716654208414975603	2022-09-28	0.0	False	False
324966	5716654208414975603	2022-09-29	0.0	False	False
324967	5716654208414975603	2022-09-30	0.0	False	False
324968	5716654208414975603	2022-10-01	30.0	False	False
324969	5716654208414975603	2022-10-25	0.0	False	False
324970	5716654208414975603	2022-10-26	0.0	False	True
324971	5716654208414975603	2022-11-01	31.0	True	True
324972	5716654208414975603	2022-12-24	53.0	True	False
324973	5716654208414975603	2023-01-02	9.0	False	False
324974	5716654208414975603	2023-01-25	23.0	False	False
324975	5716654208414975603	2023-02-23	29.0	False	False
324976	5716654208414975603	2023-03-24	29.0	False	False
324977	5716654208414975603	2023-04-03	10.0	False	False
324978	5716654208414975603	2023-04-06	0.0	False	False
324979	5716654208414975603	2023-04-13	10.0	False	False
324980	5716654208414975603	2023-04-24	0.0	False	False
324981	5716654208414975603	2023-04-25	0.0	False	False
324982	5716654208414975603	2023-04-29	0.0	False	False
324983	5716654208414975603	2023-05-01	0.0	False	False
324984	5716654208414975603	2023-05-02	19.0	False	False
324985	5716654208414975603	2023-05-24	22.0	False	False
324986	5716654208414975603	2023-06-02	9.0	False	False
324987	5716654208414975603	2023-06-22	20.0	False	False

Case Overview¶

Objective¶

Dataset Overview¶

Methodology¶

Key Takeaways¶

EDA¶

Now we will look at the numerical columns more closely with some plots.¶

Feature Engineering¶

Derived Features¶

Feature Selection¶

Model Building¶

📊 Model Evaluation Report¶

✅ Strengths¶

❌ Limitations¶

🚀 Recommendations¶