162 lines
7.3 KiB
Plaintext
162 lines
7.3 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "42259e56-f030-4f9a-b2f7-696736dc4439",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Spambase: compare L1- and L2-penalised SGD classifiers (log loss, alpha=0.01).\n",
|
||
"import numpy as np\n",
|
||
"import openml\n",
|
||
"from sklearn.linear_model import SGDClassifier\n",
|
||
"from sklearn.metrics import accuracy_score\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"\n",
|
||
"# Load the Spambase dataset (OpenML ID 44)\n",
|
||
"dataset = openml.datasets.get_dataset(44)\n",
|
||
"X_real, y_real, _, _ = dataset.get_data(target=dataset.default_target_attribute)\n",
|
||
"\n",
|
||
"# Convert the target variable to a numeric format (0 and 1)\n",
|
||
"y_real = y_real.astype(int)\n",
|
||
"\n",
|
||
"# Train/test split: 20% of the rows are held out, with a fixed seed\n",
|
||
"X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_real, y_real, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Fit the scaler on the training split only and reuse it for the test split\n",
|
||
"scaler_r = StandardScaler()\n",
|
||
"X_train_r_scaled = scaler_r.fit_transform(X_train_r)\n",
|
||
"X_test_r_scaled = scaler_r.transform(X_test_r)\n",
|
||
"\n",
|
||
"# Train with the L1 penalty\n",
|
||
"sgd_real_l1 = SGDClassifier(loss='log_loss', penalty='l1', alpha=0.01, random_state=42)\n",
|
||
"sgd_real_l1.fit(X_train_r_scaled, y_train_r)\n",
|
||
"\n",
|
||
"# Train with the L2 penalty\n",
|
||
"sgd_real_l2 = SGDClassifier(loss='log_loss', penalty='l2', alpha=0.01, random_state=42)\n",
|
||
"sgd_real_l2.fit(X_train_r_scaled, y_train_r)\n",
|
||
"\n",
|
||
"# Test-set accuracy of both models\n",
|
||
"acc_l1 = accuracy_score(y_test_r, sgd_real_l1.predict(X_test_r_scaled))\n",
|
||
"acc_l2 = accuracy_score(y_test_r, sgd_real_l2.predict(X_test_r_scaled))\n",
|
||
"\n",
|
||
"print(f\"Точность модели с L1-регуляризацией: {acc_l1:.4f}\")\n",
|
||
"print(f\"Точность модели с L2-регуляризацией: {acc_l2:.4f}\")\n",
|
||
"print(f\"Количество ненулевых весов (L1): {np.sum(sgd_real_l1.coef_ != 0)} из {X_real.shape[1]}\")\n",
|
||
"print(f\"Количество ненулевых весов (L2): {np.sum(sgd_real_l2.coef_ != 0)} из {X_real.shape[1]}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "c186361c-aa9b-4f87-8df5-34b2caff07a7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"--- Результаты L1 (Lasso) ---\n",
|
||
"Accuracy: 0.8686\n",
|
||
"Полезных признаков (вес != 0): 22 из 57\n",
|
||
"\n",
|
||
"--- Результаты L2 (Ridge) ---\n",
|
||
"Accuracy: 0.9055\n",
|
||
"Полезных признаков (вес != 0): 57 из 57\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import openml\n",
|
||
"import numpy as np\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import SGDClassifier\n",
|
||
"from sklearn.metrics import accuracy_score, classification_report\n",
|
||
"\n",
|
||
"# 1. Fetch the Spambase dataset (OpenML ID 44), a spam e-mail classification task\n",
|
||
"dataset = openml.datasets.get_dataset(44)\n",
|
||
"X_real, y_real, _, _ = dataset.get_data(\n",
|
||
"    target=dataset.default_target_attribute\n",
|
||
")\n",
|
||
"\n",
|
||
"# Cast the target to integers (0 = not spam, 1 = spam)\n",
|
||
"y_real = y_real.astype(int)\n",
|
||
"\n",
|
||
"# 2. Split into training and test sets (20% held out, fixed seed)\n",
|
||
"X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(\n",
|
||
"    X_real,\n",
|
||
"    y_real,\n",
|
||
"    test_size=0.2,\n",
|
||
"    random_state=42,\n",
|
||
")\n",
|
||
"\n",
|
||
"# Create the scaler, fit it on the training split, and apply it to both splits\n",
|
||
"scaler_r = StandardScaler()\n",
|
||
"X_train_r_scaled = scaler_r.fit_transform(X_train_r)\n",
|
||
"X_test_r_scaled = scaler_r.transform(X_test_r)\n",
|
||
"\n",
|
||
"# loss='log_loss' turns the SGD classifier into logistic regression;\n",
|
||
"# alpha is the regularisation strength (the higher it is, the harder weights are shrunk).\n",
|
||
"\n",
|
||
"# L1 (Lasso) model: expected to zero out part of the features\n",
|
||
"sgd_l1 = SGDClassifier(\n",
|
||
"    loss='log_loss', penalty='l1', alpha=0.02, random_state=42\n",
|
||
").fit(X_train_r_scaled, y_train_r)\n",
|
||
"\n",
|
||
"# L2 (Ridge) model: only shrinks the weights\n",
|
||
"sgd_l2 = SGDClassifier(\n",
|
||
"    loss='log_loss', penalty='l2', alpha=0.02, random_state=42\n",
|
||
").fit(X_train_r_scaled, y_train_r)\n",
|
||
"\n",
|
||
"# Test-set predictions\n",
|
||
"y_pred_l1 = sgd_l1.predict(X_test_r_scaled)\n",
|
||
"y_pred_l2 = sgd_l2.predict(X_test_r_scaled)\n",
|
||
"\n",
|
||
"\n",
|
||
"def _print_report(header, model, y_pred):\n",
|
||
"    \"\"\"Print the header, the test accuracy and the non-zero weight count of one model.\"\"\"\n",
|
||
"    print(header)\n",
|
||
"    print(f\"Accuracy: {accuracy_score(y_test_r, y_pred):.4f}\")\n",
|
||
"    print(f\"Полезных признаков (вес != 0): {np.sum(model.coef_ != 0)} из {X_real.shape[1]}\")\n",
|
||
"\n",
|
||
"\n",
|
||
"_print_report(\"--- Результаты L1 (Lasso) ---\", sgd_l1, y_pred_l1)\n",
|
||
"_print_report(\"\\n--- Результаты L2 (Ridge) ---\", sgd_l2, y_pred_l2)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "52c23789-f6f5-4ef4-9efc-0dda66562e71",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "559b4093-337c-4897-9e51-45d8e24c1be3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.13.7"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|