FraFut
Nuovo Utente
- Messaggi
- 6
- Reazioni
- 0
- Punteggio
- 22
1. Complete the ClassificationPipeline class:Your entire program for the third exercise on Classification must fall under theClassificationPipeline class.2. Complete __init__():The bare minimum logic for the __init__ function is provided.In contrast to previous exercises, logic for asking the user the filepath to the dataset,loading that file, and then asking the user the target variable is included in this function.Therefore, once you initialize the object under the conditional statement if __name__ =“main”:, this function will be called and the aforesaid logic will be executed.Moreover, this function can include other initializations, but it all depends on the approachyou adopt to implement this exercise.3. Add the missing functions:You will notice that unlike previous exercises, an outline of the functions that will make upyour program is missing from this one. These specifics are not provided, but if you gothrough the practice exercise, and you must, you will know precisely what functions arerequired. A quick rundown of the logic you must add is as follows:• Preprocessing the dataset• Training your model using Decision Tree classification algorithm.• Training your model using Random Forest classification algorithm.• Computing the necessary metrics, at least Accuracy and AUC score, after everytraining and prediction.• Applying the different resampling techniques that you learned from the practiceexercise.• Option to train your model based on only the important features from the dataset.• Tuning hyperparameters using GridSearchCV.• Tuning hyperparameters using RandomizedSearchCV.• Training your model using Decision Tree classification algorithm but with only theimportant features.• Training your model using Random Forest classification algorithm but with only theimportant features.• Storing the metrics (Accuracy and AUC score) from every modelling variation, likesimple models, models based on different resampling techniques, hyperparametertuned models, models trained on only the important features.The best combination of Accuracy and AUC score must be at least 0.96. Meaning,your best model must have a score of at least 0.96, or else the test will fail. Sofigure out what combination of what variations is needed to get this score.4. Complete run_pipeline():This function will, mostly, call the other functions that you must implement, as indicated in#3.The two print statements already provided MUST REMAIN UNTOUCHED. These twostatements are vital for the successful testing of your program.5. OPTIONAL: PlottingIt is not mandatory that you include logic for plotting confusion matrix, ROC curve, and thedecision tree. You may do so at your discretion. ragazzi devo fare questo programma queastoè il codice che sono riuscito ad ottenere
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek, SMOTEENN
class ClassificationPipeline:
def __init__(self, dataset_path, target_column):
"""Initialize the pipeline."""
self.dataset_path = dataset_path
self.df = pd.read_csv(self.dataset_path)
print("Dataset loaded successfully.")
print(self.df.head())
self.target_column = target_column
self.X = self.df.drop(columns=[self.target_column])
self.y = self.df[self.target_column]
self.preprocess()
self.results = {}
self.best_method = None
def preprocess(self):
"""Preprocess the dataset: encode categorical variables and split data."""
for col in self.X.select_dtypes(include=['object']).columns:
self.X[col] = LabelEncoder().fit_transform(self.X[col])
if self.y.dtype in ['float64', 'int64'] and len(np.unique(self.y)) > 10:
raise ValueError(f"Target column '{self.target_column}' seems to be continuous. Convert it to categorical!")
self.y = LabelEncoder().fit_transform(self.y)
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
self.X, self.y, test_size=0.2, random_state=42, stratify=self.y)
print("Preprocessing complete.")
def train_model(self, model_name, model):
"""Train and evaluate a model."""
model.fit(self.X_train, self.y_train)
y_pred = model.predict(self.X_test)
if hasattr(model, "predict_proba"):
y_prob = model.predict_proba(self.X_test)
auc_score = roc_auc_score(self.y_test, y_prob[:, 1] if y_prob.shape[1] > 1 else y_prob)
else:
auc_score = 0.0
acc = accuracy_score(self.y_test, y_pred)
self.results[model_name] = {'accuracy': acc, 'auc': auc_score}
print(f"DEBUG - {model_name}: accuracy = {acc:.4f}, auc = {auc_score:.4f}")
if self.best_method is None or self.results[self.best_method]['auc'] < auc_score:
self.best_method = model_name
print(f"DEBUG - New best method: {self.best_method}")
def apply_resampling(self, method):
"""Apply different resampling techniques."""
samplers = {'SMOTE': SMOTE(random_state=42), 'SMOTETomek': SMOTETomek(random_state=42),
'SMOTEENN': SMOTEENN(random_state=42)}
if method in samplers:
self.X_train, self.y_train = samplers[method].fit_resample(self.X_train, self.y_train)
print(f"Applied {method} resampling.")
def hyperparameter_tuning(self):
"""Tune hyperparameters using GridSearchCV."""
param_grid = {'max_depth': [3, 5, 10], 'min_samples_split': [2, 5, 10]}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring='accuracy')
grid_search.fit(self.X_train, self.y_train)
print(f"Best Parameters: {grid_search.best_params_}")
def plot_results(self, model):
"""Plot ROC curve and Confusion Matrix."""
y_prob = model.predict_proba(self.X_test)
plt.figure(figsize=(8, 6))
if len(np.unique(self.y_test)) > 2:
y_test_bin = label_binarize(self.y_test, classes=np.unique(self.y_test))
for i in range(y_test_bin.shape[1]):
fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
plt.plot(fpr, tpr, label=f'Class {i} ROC Curve')
else:
fpr, tpr, _ = roc_curve(self.y_test, y_prob[:, 1])
plt.plot(fpr, tpr, label=f'{model.__class__.__name__} ROC Curve')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()
sns.heatmap(confusion_matrix(self.y_test, model.predict(self.X_test)), annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.show()
def run_pipeline(self):
"""Run the full classification pipeline."""
dt_model = DecisionTreeClassifier(random_state=42)
self.train_model('Decision Tree', dt_model)
rf_model = RandomForestClassifier(random_state=42)
self.train_model('Random Forest', rf_model)
self.apply_resampling('SMOTE')
self.train_model('Decision Tree with SMOTE', DecisionTreeClassifier(random_state=42))
self.hyperparameter_tuning()
best_model = DecisionTreeClassifier(random_state=42)
best_model.fit(self.X_train, self.y_train)
"""DO NOT CHANGE THE FOLLOWING TWO LINES OF CODE. THEY ARE NEEDED TO TEST YOUR MODEL PERFORMANCE BY THE TEST SUITE."""
print(f"Best Accuracy Score: {self.results[self.best_method]['accuracy']:.4f}")
print(f"Best AUC Score: {self.results[self.best_method]['auc']:.4f}")
if __name__ == "__main__":
pipeline = ClassificationPipeline("test_data.csv", "diabetes")
pipeline.run_pipeline() ma non passa il test:
import pytest
import subprocess
import re
import matplotlib.pyplot as plt
THRESHOLD = 0.96
def extract_metric(output, metric_name):
match = re.search(fr"{metric_name}: ([0-9.]+)", output)
if match:
return float(match.group(1))
return None
@pytest.mark.parametrize("dataset_path, target_column", [("test_data.csv", "diabetes")])
def test_model_performance(mocker, dataset_path, target_column):
mocker.patch.object(plt, "show")
result = subprocess.run(
["python", "ex3_classification.py"],
text=True,
capture_output=True,
input=f"{dataset_path}\n{target_column}\n"
)
output = result.stdout
best_accuracy = extract_metric(output, "Best Accuracy Score")
best_auc = extract_metric(output, "Best AUC Score")
assert best_accuracy is not None, "Failed to extract Accuracy Score."
assert best_auc is not None, "Failed to extract AUC Score."
assert best_accuracy >= THRESHOLD, f"Accuracy {best_accuracy:.4f} is below {THRESHOLD}"
assert best_auc >= THRESHOLD, f"AUC {best_auc:.4f} is below {THRESHOLD}"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek, SMOTEENN
class ClassificationPipeline:
def __init__(self, dataset_path, target_column):
"""Initialize the pipeline."""
self.dataset_path = dataset_path
self.df = pd.read_csv(self.dataset_path)
print("Dataset loaded successfully.")
print(self.df.head())
self.target_column = target_column
self.X = self.df.drop(columns=[self.target_column])
self.y = self.df[self.target_column]
self.preprocess()
self.results = {}
self.best_method = None
def preprocess(self):
"""Preprocess the dataset: encode categorical variables and split data."""
for col in self.X.select_dtypes(include=['object']).columns:
self.X[col] = LabelEncoder().fit_transform(self.X[col])
if self.y.dtype in ['float64', 'int64'] and len(np.unique(self.y)) > 10:
raise ValueError(f"Target column '{self.target_column}' seems to be continuous. Convert it to categorical!")
self.y = LabelEncoder().fit_transform(self.y)
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
self.X, self.y, test_size=0.2, random_state=42, stratify=self.y)
print("Preprocessing complete.")
def train_model(self, model_name, model):
"""Train and evaluate a model."""
model.fit(self.X_train, self.y_train)
y_pred = model.predict(self.X_test)
if hasattr(model, "predict_proba"):
y_prob = model.predict_proba(self.X_test)
auc_score = roc_auc_score(self.y_test, y_prob[:, 1] if y_prob.shape[1] > 1 else y_prob)
else:
auc_score = 0.0
acc = accuracy_score(self.y_test, y_pred)
self.results[model_name] = {'accuracy': acc, 'auc': auc_score}
print(f"DEBUG - {model_name}: accuracy = {acc:.4f}, auc = {auc_score:.4f}")
if self.best_method is None or self.results[self.best_method]['auc'] < auc_score:
self.best_method = model_name
print(f"DEBUG - New best method: {self.best_method}")
def apply_resampling(self, method):
"""Apply different resampling techniques."""
samplers = {'SMOTE': SMOTE(random_state=42), 'SMOTETomek': SMOTETomek(random_state=42),
'SMOTEENN': SMOTEENN(random_state=42)}
if method in samplers:
self.X_train, self.y_train = samplers[method].fit_resample(self.X_train, self.y_train)
print(f"Applied {method} resampling.")
def hyperparameter_tuning(self):
"""Tune hyperparameters using GridSearchCV."""
param_grid = {'max_depth': [3, 5, 10], 'min_samples_split': [2, 5, 10]}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring='accuracy')
grid_search.fit(self.X_train, self.y_train)
print(f"Best Parameters: {grid_search.best_params_}")
def plot_results(self, model):
"""Plot ROC curve and Confusion Matrix."""
y_prob = model.predict_proba(self.X_test)
plt.figure(figsize=(8, 6))
if len(np.unique(self.y_test)) > 2:
y_test_bin = label_binarize(self.y_test, classes=np.unique(self.y_test))
for i in range(y_test_bin.shape[1]):
fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
plt.plot(fpr, tpr, label=f'Class {i} ROC Curve')
else:
fpr, tpr, _ = roc_curve(self.y_test, y_prob[:, 1])
plt.plot(fpr, tpr, label=f'{model.__class__.__name__} ROC Curve')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()
sns.heatmap(confusion_matrix(self.y_test, model.predict(self.X_test)), annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.show()
def run_pipeline(self):
"""Run the full classification pipeline."""
dt_model = DecisionTreeClassifier(random_state=42)
self.train_model('Decision Tree', dt_model)
rf_model = RandomForestClassifier(random_state=42)
self.train_model('Random Forest', rf_model)
self.apply_resampling('SMOTE')
self.train_model('Decision Tree with SMOTE', DecisionTreeClassifier(random_state=42))
self.hyperparameter_tuning()
best_model = DecisionTreeClassifier(random_state=42)
best_model.fit(self.X_train, self.y_train)
"""DO NOT CHANGE THE FOLLOWING TWO LINES OF CODE. THEY ARE NEEDED TO TEST YOUR MODEL PERFORMANCE BY THE TEST SUITE."""
print(f"Best Accuracy Score: {self.results[self.best_method]['accuracy']:.4f}")
print(f"Best AUC Score: {self.results[self.best_method]['auc']:.4f}")
if __name__ == "__main__":
pipeline = ClassificationPipeline("test_data.csv", "diabetes")
pipeline.run_pipeline() ma non passa il test:
import pytest
import subprocess
import re
import matplotlib.pyplot as plt
THRESHOLD = 0.96
def extract_metric(output, metric_name):
match = re.search(fr"{metric_name}: ([0-9.]+)", output)
if match:
return float(match.group(1))
return None
@pytest.mark.parametrize("dataset_path, target_column", [("test_data.csv", "diabetes")])
def test_model_performance(mocker, dataset_path, target_column):
mocker.patch.object(plt, "show")
result = subprocess.run(
["python", "ex3_classification.py"],
text=True,
capture_output=True,
input=f"{dataset_path}\n{target_column}\n"
)
output = result.stdout
best_accuracy = extract_metric(output, "Best Accuracy Score")
best_auc = extract_metric(output, "Best AUC Score")
assert best_accuracy is not None, "Failed to extract Accuracy Score."
assert best_auc is not None, "Failed to extract AUC Score."
assert best_accuracy >= THRESHOLD, f"Accuracy {best_accuracy:.4f} is below {THRESHOLD}"
assert best_auc >= THRESHOLD, f"AUC {best_auc:.4f} is below {THRESHOLD}"