4laba/4444/Untitled.ipynb

162 lines
7.3 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "42259e56-f030-4f9a-b2f7-696736dc4439",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'StandardScaler' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 14\u001b[39m\n\u001b[32m 10\u001b[39m y_real = y_real.astype(int)\n\u001b[32m 11\u001b[39m \n\u001b[32m 12\u001b[39m \u001b[38;5;66;03m# Разбиение (теперь функция импортирована и сработает без ошибок)\u001b[39;00m\n\u001b[32m 13\u001b[39m X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_real, y_real, test_size=\u001b[32m0.2\u001b[39m, random_state=\u001b[32m42\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m scaler_r = StandardScaler()\n\u001b[32m 15\u001b[39m X_train_r_scaled = scaler_r.fit_transform(X_train_r)\n\u001b[32m 16\u001b[39m X_test_r_scaled = scaler_r.transform(X_test_r)\n\u001b[32m 17\u001b[39m \u001b[38;5;66;03m# Обучение с L1\u001b[39;00m\n",
"\u001b[31mNameError\u001b[39m: name 'StandardScaler' is not defined"
]
}
],
"source": [
"import numpy as np\n",
"import openml\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Load the Spambase dataset (OpenML ID 44).\n",
"dataset = openml.datasets.get_dataset(44)\n",
"X_real, y_real, _, _ = dataset.get_data(target=dataset.default_target_attribute)\n",
"\n",
"# Convert the target variable to integers (0 and 1).\n",
"y_real = y_real.astype(int)\n",
"\n",
"# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.\n",
"X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_real, y_real, test_size=0.2, random_state=42)\n",
"\n",
"# Fit the scaler on the training split only, then apply the same transform to the test split.\n",
"scaler_r = StandardScaler()\n",
"X_train_r_scaled = scaler_r.fit_transform(X_train_r)\n",
"X_test_r_scaled = scaler_r.transform(X_test_r)\n",
"\n",
"# Logistic regression trained with SGD, L1 penalty.\n",
"sgd_real_l1 = SGDClassifier(loss='log_loss', penalty='l1', alpha=0.01, random_state=42)\n",
"sgd_real_l1.fit(X_train_r_scaled, y_train_r)\n",
"\n",
"# Same model with an L2 penalty, for comparison.\n",
"sgd_real_l2 = SGDClassifier(loss='log_loss', penalty='l2', alpha=0.01, random_state=42)\n",
"sgd_real_l2.fit(X_train_r_scaled, y_train_r)\n",
"\n",
"# Test-set accuracy of both models.\n",
"acc_l1 = accuracy_score(y_test_r, sgd_real_l1.predict(X_test_r_scaled))\n",
"acc_l2 = accuracy_score(y_test_r, sgd_real_l2.predict(X_test_r_scaled))\n",
"\n",
"print(f\"Точность модели с L1-регуляризацией: {acc_l1:.4f}\")\n",
"print(f\"Точность модели с L2-регуляризацией: {acc_l2:.4f}\")\n",
"# Count the coefficients that are not exactly zero in each model.\n",
"print(f\"Количество ненулевых весов (L1): {np.sum(sgd_real_l1.coef_ != 0)} из {X_real.shape[1]}\")\n",
"print(f\"Количество ненулевых весов (L2): {np.sum(sgd_real_l2.coef_ != 0)} из {X_real.shape[1]}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c186361c-aa9b-4f87-8df5-34b2caff07a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- Результаты L1 (Lasso) ---\n",
"Accuracy: 0.8686\n",
"Полезных признаков (вес != 0): 22 из 57\n",
"\n",
"--- Результаты L2 (Ridge) ---\n",
"Accuracy: 0.9055\n",
"Полезных признаков (вес != 0): 57 из 57\n"
]
}
],
"source": [
"import openml\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"# 1. Fetch the Spambase dataset (OpenML ID 44): spam e-mail classification.\n",
"dataset = openml.datasets.get_dataset(44)\n",
"X_real, y_real, _, _ = dataset.get_data(target=dataset.default_target_attribute)\n",
"\n",
"# Cast the target to integers (0 = not spam, 1 = spam).\n",
"y_real = y_real.astype(int)\n",
"\n",
"# 2. Split into training and test sets (20% held out, fixed seed).\n",
"X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(\n",
"    X_real, y_real, test_size=0.2, random_state=42\n",
")\n",
"\n",
"# Fit the scaler on the training split, then reuse it for the test split.\n",
"scaler_r = StandardScaler()\n",
"X_train_r_scaled = scaler_r.fit_transform(X_train_r)\n",
"X_test_r_scaled = scaler_r.transform(X_test_r)\n",
"\n",
"# Settings shared by both models: loss='log_loss' turns SGD into logistic\n",
"# regression; alpha is the regularization strength (higher shrinks weights more).\n",
"sgd_kwargs = dict(loss='log_loss', alpha=0.02, random_state=42)\n",
"\n",
"# L1 (Lasso) model: expected to zero out some of the feature weights.\n",
"sgd_l1 = SGDClassifier(penalty='l1', **sgd_kwargs).fit(X_train_r_scaled, y_train_r)\n",
"\n",
"# L2 (Ridge) model: only shrinks the weights.\n",
"sgd_l2 = SGDClassifier(penalty='l2', **sgd_kwargs).fit(X_train_r_scaled, y_train_r)\n",
"\n",
"# Test-set predictions.\n",
"y_pred_l1 = sgd_l1.predict(X_test_r_scaled)\n",
"y_pred_l2 = sgd_l2.predict(X_test_r_scaled)\n",
"\n",
"# Report accuracy and the number of non-zero coefficients for each model.\n",
"n_features = X_real.shape[1]\n",
"for header, model, y_pred in (\n",
"    (\"--- Результаты L1 (Lasso) ---\", sgd_l1, y_pred_l1),\n",
"    (\"\\n--- Результаты L2 (Ridge) ---\", sgd_l2, y_pred_l2),\n",
"):\n",
"    print(header)\n",
"    print(f\"Accuracy: {accuracy_score(y_test_r, y_pred):.4f}\")\n",
"    print(f\"Полезных признаков (вес != 0): {np.sum(model.coef_ != 0)} из {n_features}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52c23789-f6f5-4ef4-9efc-0dda66562e71",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "559b4093-337c-4897-9e51-45d8e24c1be3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}