"A well-designed data science machine learning project establishes a solid foundation for successful outcomes by carefully considering data quality, feature engineering, model selection, evaluation metrics, and deployment strategies."- Gemini 2024
Can we train a machine learning model to determine if an SMS message is spam?
This project uses the UCI SMS Spam Collection Dataset.
The dataset contains 5,574 SMS messages labeled as either ham (legitimate) or spam.
ham
spam
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the data: tab-separated file read with explicit column names
# (label first, message text second).
df = pd.read_csv('spam-sms.tsv', sep="\t", names=["label", "text"])

# Explore the data
print(df.head())
print(df.describe())

# Data cleaning: drop rows with a missing label or missing text.
df.dropna(inplace=True)

y = df['label']

# Split the RAW text before extracting features. Fitting the vectorizer on
# the full corpus would let test-set vocabulary leak into the training
# features and make the reported scores optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# Feature extraction: learn the vocabulary from the training messages only,
# then apply that same vocabulary to the held-out messages.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix under the original name `X`, kept in case later code
# refers to it; it is encoded with the training-only vocabulary.
X = vectorizer.transform(df['text'])

# Initialize classifiers
classifiers = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}


def evaluate_model(y_true, y_pred, pos_label='spam'):
    """Score a set of predictions against the true labels.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        pos_label: Label treated as the positive class for precision,
            recall and F1. Defaults to ``'spam'``.

    Returns:
        A dict with the keys ``"Accuracy"``, ``"Precision"``, ``"Recall"``
        and ``"F1-score"`` mapped to the corresponding score.
    """
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, pos_label=pos_label),
        "Recall": recall_score(y_true, y_pred, pos_label=pos_label),
        "F1-score": f1_score(y_true, y_pred, pos_label=pos_label),
    }


# Train and evaluate each classifier
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    results[name] = evaluate_model(y_test, y_pred)

# Print results
for name, scores in results.items():
    print(f"\nResults for {name}:")
    for metric, value in scores.items():
        print(f"{metric}: {value:.4f}")

# Compare classifiers
print("\nComparison of classifiers:")
# Every entry in `results` has the same keys, so read the metric names from
# the first one instead of relying on a loop variable left over from above.
metrics = list(next(iter(results.values())))
for metric in metrics:
    print(f"\n{metric}:")
    for name in results:
        print(f"{name}: {results[name][metric]:.4f}")

# Find the best classifier for each metric (highest score wins; on a tie the
# classifier that was evaluated first is kept).
best_classifiers = {
    metric: max(results, key=lambda clf_name: results[clf_name][metric])
    for metric in metrics
}

print("\nBest classifier for each metric:")
for metric, classifier in best_classifiers.items():
    print(f"{metric}: {classifier}")