{ "cells": [ { "cell_type": "markdown", "id": "32e6184a-41e1-4948-b870-f7bf309842eb", "metadata": {}, "source": [ "Загрузка и подготовка данных из CSV\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "4a781ce0-6c44-49d8-a129-acd160bfcdb2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Student_ID Age Gender Academic_Level Country Avg_Daily_Usage_Hours \\\n", "0 1 19 Female Undergraduate Bangladesh 5.2 \n", "1 2 22 Male Graduate India 2.1 \n", "2 3 20 Female Undergraduate USA 6.0 \n", "3 4 18 Male High School UK 3.0 \n", "4 5 21 Male Graduate Canada 4.5 \n", "\n", " Most_Used_Platform Affects_Academic_Performance Sleep_Hours_Per_Night \\\n", "0 Instagram Yes 6.5 \n", "1 Twitter No 7.5 \n", "2 TikTok Yes 5.0 \n", "3 YouTube No 7.0 \n", "4 Facebook Yes 6.0 \n", "\n", " Mental_Health_Score Relationship_Status Conflicts_Over_Social_Media \\\n", "0 6 In Relationship 3 \n", "1 8 Single 0 \n", "2 5 Complicated 4 \n", "3 7 Single 1 \n", "4 6 In Relationship 2 \n", "\n", " Addicted_Score \n", "0 8 \n", "1 3 \n", "2 9 \n", "3 4 \n", "4 7 \n", "\n", "RangeIndex: 705 entries, 0 to 704\n", "Data columns (total 13 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Student_ID 705 non-null int64 \n", " 1 Age 705 non-null int64 \n", " 2 Gender 705 non-null object \n", " 3 Academic_Level 705 non-null object \n", " 4 Country 705 non-null object \n", " 5 Avg_Daily_Usage_Hours 705 non-null float64\n", " 6 Most_Used_Platform 705 non-null object \n", " 7 Affects_Academic_Performance 705 non-null object \n", " 8 Sleep_Hours_Per_Night 705 non-null float64\n", " 9 Mental_Health_Score 705 non-null int64 \n", " 10 Relationship_Status 705 non-null object \n", " 11 Conflicts_Over_Social_Media 705 non-null int64 \n", " 12 Addicted_Score 705 non-null int64 \n", "dtypes: float64(2), int64(5), object(6)\n", "memory usage: 71.7+ KB\n", "None\n", "Train shape: (564, 11), Test shape: (141, 11)\n", "Class distribution in train:\n", "Addicted_Label\n", "1 0.664894\n", "0 0.335106\n", "Name: proportion, dtype: float64\n" ] } ], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", "# Загрузка данных из CSV\n", "df = pd.read_csv(\"Students Social Media Addiction.csv\")\n", "\n", "# Быстрый осмотр\n", "print(df.head())\n", "print(df.info())\n", "\n", "# Преобразуем категориальные переменные в числовые с помощью LabelEncoder\n", "label_encoders = {}\n", "categorical_cols = [\"Gender\", \"Academic_Level\", \"Country\", \"Most_Used_Platform\", \"Relationship_Status\", \"Affects_Academic_Performance\"]\n", "\n", "for col in categorical_cols:\n", " le = LabelEncoder()\n", " df[col] = le.fit_transform(df[col])\n", " label_encoders[col] = le\n", "\n", "# Целевая переменная: Addicted_Score (например, можно классифицировать как 'high' или 'low')\n", "# Для простоты создадим бинарную цель: Addicted_Score >= 6 - \"Addicted\", иначе \"Not Addicted\"\n", "df[\"Addicted_Label\"] = (df[\"Addicted_Score\"] >= 6).astype(int)\n", "\n", "# Выделяем признаки и целевую переменную\n", "X = df.drop(columns=[\"Student_ID\", \"Addicted_Score\", \"Addicted_Label\"])\n", "y = df[\"Addicted_Label\"]\n", "\n", "# Делим на train и test\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", "\n", "print(f\"Train shape: {X_train.shape}, Test shape: {X_test.shape}\")\n", "print(f\"Class distribution in train:\\n{y_train.value_counts(normalize=True)}\")\n" ] }, { "cell_type": "markdown", "id": "6ea74112-ba53-45c0-8ede-49155e903890", "metadata": {}, "source": [ "Обучение модели RidgeClassifier" ] }, { "cell_type": "code", "execution_count": 3, "id": "d03ccd19-aba5-4fa9-b9a8-13e24039220e", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import RidgeClassifier\n", "\n", "clf = RidgeClassifier(tol=1e-2, solver=\"sparse_cg\")\n", "clf.fit(X_train, y_train)\n", "\n", "pred = clf.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 4, "id": "9c85dc8e-1430-4a1f-8b2a-465798ea0f1b", "metadata": {}, "outputs": [], "source": [ "from sklearn import metrics\n", "from sklearn.utils.extmath import density\n", "from time import time\n", "\n", "def benchmark(clf, X_train, y_train, X_test, y_test, custom_name=False):\n", " print(\"_\" * 80)\n", " print(\"Training: \")\n", " print(clf)\n", " t0 = time()\n", " clf.fit(X_train, y_train)\n", " train_time = time() - t0\n", " print(f\"train time: {train_time:.3f}s\")\n", "\n", " t0 = time()\n", " pred = clf.predict(X_test)\n", " test_time = time() - t0\n", " print(f\"test time: {test_time:.3f}s\")\n", "\n", " score = metrics.accuracy_score(y_test, pred)\n", " print(f\"accuracy: {score:.3f}\")\n", "\n", " if hasattr(clf, \"coef_\"):\n", " if len(clf.coef_.shape) == 2:\n", " n_features = clf.coef_.shape[1]\n", " else:\n", " n_features = clf.coef_.shape[0]\n", " print(f\"dimensionality: {n_features}\")\n", " print(f\"density: {density(clf.coef_)}\")\n", " print()\n", "\n", " print()\n", " clf_descr = str(custom_name) if custom_name else clf.__class__.__name__\n", " return clf_descr, score, train_time, test_time\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "79250597-7639-41fb-8c70-d87d58afe8ac", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "LogisticRegression(C=5, max_iter=1000)\n", "train time: 0.052s\n", "test time: 0.001s\n", "accuracy: 0.965\n", "dimensionality: 11\n", "density: 1.0\n", "\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "RidgeClassifier(solver='sparse_cg')\n", "train time: 0.001s\n", "test time: 0.000s\n", "accuracy: 0.965\n", "dimensionality: 11\n", "density: 1.0\n", "\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "KNeighborsClassifier(n_neighbors=100)\n", "train time: 0.001s\n", "test time: 0.004s\n", "accuracy: 0.716\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "RandomForestClassifier()\n", "train time: 0.060s\n", "test time: 0.003s\n", "accuracy: 0.979\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "LinearSVC(C=0.1, dual=False)\n", "train time: 0.001s\n", "test time: 0.000s\n", "accuracy: 0.965\n", "dimensionality: 11\n", "density: 1.0\n", "\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "SGDClassifier(early_stopping=True, loss='log_loss', n_iter_no_change=3)\n", "train time: 0.002s\n", "test time: 0.000s\n", "accuracy: 0.936\n", "dimensionality: 11\n", "density: 1.0\n", "\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "NearestCentroid()\n", "train time: 0.001s\n", "test time: 0.001s\n", "accuracy: 0.539\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "ComplementNB(alpha=0.1)\n", "train time: 0.001s\n", "test time: 0.000s\n", "accuracy: 0.837\n", "\n" ] } ], "source": [ "from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier\n", "from sklearn.neighbors import KNeighborsClassifier, NearestCentroid\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.svm import LinearSVC\n", "from sklearn.naive_bayes import ComplementNB\n", "\n", "classifiers = [\n", " (LogisticRegression(C=5, max_iter=1000), \"Logistic Regression\"),\n", " (RidgeClassifier(alpha=1.0, solver=\"sparse_cg\"), \"Ridge Classifier\"),\n", " (KNeighborsClassifier(n_neighbors=100), \"kNN\"),\n", " (RandomForestClassifier(), \"Random Forest\"),\n", " (LinearSVC(C=0.1, dual=False, max_iter=1000), \"Linear SVC\"),\n", " (SGDClassifier(loss=\"log_loss\", alpha=1e-4, n_iter_no_change=3, early_stopping=True), \"log-loss SGD\"),\n", " (NearestCentroid(), \"NearestCentroid\"),\n", " (ComplementNB(alpha=0.1), \"Complement naive Bayes\"),\n", "]\n", "\n", "results = []\n", "for clf, name in classifiers:\n", " print(\"=\" * 80)\n", " results.append(benchmark(clf, X_train, y_train, X_test, y_test, name))\n" ] }, { "cell_type": "markdown", "id": "41412766-f5a2-4ca7-aa6c-18f54484b0be", "metadata": {}, "source": [ "Визуализация результатов" ] }, { "cell_type": "code", "execution_count": 8, "id": "c75ef28c-88d4-489c-9b23-8b9307f106a9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "LogisticRegression(max_iter=1000)\n", "train time: 0.043s\n", "test time: 0.001s\n", "accuracy: 0.965\n", "dimensionality: 11\n", "density: 1.0\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "RidgeClassifier(solver='sparse_cg')\n", "train time: 0.001s\n", "test time: 0.000s\n", "accuracy: 0.965\n", "dimensionality: 11\n", "density: 1.0\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "KNeighborsClassifier(n_neighbors=10)\n", "train time: 0.001s\n", "test time: 0.003s\n", "accuracy: 0.908\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "RandomForestClassifier()\n", "train time: 0.061s\n", "test time: 0.003s\n", "accuracy: 0.979\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "LinearSVC()\n", "train time: 0.001s\n", "test time: 0.000s\n", "accuracy: 0.965\n", "dimensionality: 11\n", "density: 1.0\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "SGDClassifier(early_stopping=True, loss='log_loss', n_iter_no_change=3)\n", "train time: 0.002s\n", "test time: 0.000s\n", "accuracy: 0.752\n", "dimensionality: 11\n", "density: 1.0\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "NearestCentroid()\n", "train time: 0.001s\n", "test time: 0.001s\n", "accuracy: 0.539\n", "\n", "================================================================================\n", "________________________________________________________________________________\n", "Training: \n", "ComplementNB(alpha=0.1)\n", "train time: 0.001s\n", "test time: 0.000s\n", "accuracy: 0.837\n", "\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from sklearn import metrics\n", "from sklearn.utils.extmath import density\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression, SGDClassifier\n", "from sklearn.naive_bayes import ComplementNB\n", "from sklearn.neighbors import KNeighborsClassifier, NearestCentroid\n", "from sklearn.svm import LinearSVC\n", "from time import time\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "def benchmark(clf, X_train, y_train, X_test, y_test, custom_name=False):\n", " print(\"_\" * 80)\n", " print(\"Training: \")\n", " print(clf)\n", " t0 = time()\n", " clf.fit(X_train, y_train)\n", " train_time = time() - t0\n", " print(f\"train time: {train_time:.3f}s\")\n", "\n", " t0 = time()\n", " pred = clf.predict(X_test)\n", " test_time = time() - t0\n", " print(f\"test time: {test_time:.3f}s\")\n", "\n", " score = metrics.accuracy_score(y_test, pred)\n", " print(f\"accuracy: {score:.3f}\")\n", "\n", " if hasattr(clf, \"coef_\"):\n", " coef_shape = clf.coef_.shape\n", " if len(coef_shape) == 1:\n", " print(f\"dimensionality: {coef_shape[0]}\")\n", " else:\n", " print(f\"dimensionality: {coef_shape[1]}\")\n", " print(f\"density: {density(clf.coef_)}\")\n", "\n", " print()\n", " clf_descr = str(custom_name) if custom_name else clf.__class__.__name__\n", " return clf_descr, score, train_time, test_time\n", "\n", "\n", "results = []\n", "classifiers = [\n", " (LogisticRegression(max_iter=1000), \"Logistic Regression\"),\n", " (RidgeClassifier(alpha=1.0, solver=\"sparse_cg\"), \"Ridge Classifier\"),\n", " (KNeighborsClassifier(n_neighbors=10), \"kNN\"),\n", " (RandomForestClassifier(), \"Random Forest\"),\n", " (LinearSVC(max_iter=1000), \"Linear SVC\"),\n", " (SGDClassifier(loss=\"log_loss\", alpha=1e-4, n_iter_no_change=3, early_stopping=True), \"log-loss SGD\"),\n", " (NearestCentroid(), \"NearestCentroid\"),\n", " (ComplementNB(alpha=0.1), \"Complement naive Bayes\"),\n", "]\n", "\n", "for clf, name in classifiers:\n", " print(\"=\" * 80)\n", " results.append(benchmark(clf, X_train, y_train, X_test, y_test, name))\n", "\n", "# Визуализация результатов\n", "results_arr = np.array(results)\n", "clf_names, scores, train_times, test_times = results_arr.T\n", "\n", "fig, ax1 = plt.subplots(figsize=(10, 8))\n", "ax1.scatter(scores.astype(float), train_times.astype(float), s=60)\n", "ax1.set(title=\"Score-training time trade-off\", yscale=\"log\", xlabel=\"Test accuracy\", ylabel=\"Training time (s)\")\n", "\n", "fig, ax2 = plt.subplots(figsize=(10, 8))\n", "ax2.scatter(scores.astype(float), test_times.astype(float), s=60)\n", "ax2.set(title=\"Score-test time trade-off\", yscale=\"log\", xlabel=\"Test accuracy\", ylabel=\"Test time (s)\")\n", "\n", "for i, txt in enumerate(clf_names):\n", " ax1.annotate(txt, (float(scores[i]), float(train_times[i])))\n", " ax2.annotate(txt, (float(scores[i]), float(test_times[i])))\n" ] }, { "cell_type": "markdown", "id": "35fb3149-bae9-46b0-b64e-bf6e319b68fe", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.3" } }, "nbformat": 4, "nbformat_minor": 5 }