Проект готов

This commit is contained in:
Максим Катков 2025-05-17 08:07:42 +03:00
parent da3e12f42e
commit 7cd2770d8d
5 changed files with 727 additions and 1 deletions

4
.gitignore vendored
View File

@ -1 +1,3 @@
.venv/
.venv/
data
.ipynb_checkpoints/

309
Untitled.ipynb Normal file

File diff suppressed because one or more lines are too long

237
Untitled1.ipynb Normal file

File diff suppressed because one or more lines are too long

108
plot_sgd_comparison copy.py Normal file
View File

@ -0,0 +1,108 @@
import torch
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import (
LogisticRegression,
PassiveAggressiveClassifier,
Perceptron,
SGDClassifier,
)
from sklearn.model_selection import train_test_split
# -----------------------------
# 1. Load and prepare the data
# -----------------------------
# Every sample is converted to a tensor and then flattened into a 1-D NumPy
# array so it can be used as a feature row by scikit-learn.
transform = transforms.Compose([
    transforms.ToTensor(),  # convert to a tensor
    lambda x: x.view(-1).numpy(),  # flatten the image into a 1-D array
])

# EMNIST (Letters): contains the letters A-Z.
train_dataset = datasets.EMNIST(
    root='./data', split='letters', train=True, download=True, transform=transform
)
test_dataset = datasets.EMNIST(
    root='./data', split='letters', train=False, download=True, transform=transform
)

# Limit the data to ~1800 samples (about the size of the digits() dataset).
SAMPLE_LIMIT = 1800

# Walk the train split and then the test split in order, stopping as soon as
# SAMPLE_LIMIT samples have been collected. Indexing lazily fetches each
# sample exactly once and avoids materialising both full splits only to
# discard everything past the limit; the selected samples and their order are
# the same as "concatenate train + test, then slice".
features = []
labels = []
for dataset in (train_dataset, test_dataset):
    remaining = SAMPLE_LIMIT - len(features)
    for idx in range(min(remaining, len(dataset))):
        image, label = dataset[idx]
        features.append(image)
        labels.append(label - 1)  # labels run 1..26 -> shift to 0..25

X = np.array(features)
y = np.array(labels)

print("Данные загружены:", X.shape, y.shape)
# -----------------------------
# 2. Model configuration
# -----------------------------
# Fractions of the data held out as the test set, one curve point per entry.
heldout = [0.95, 0.90, 0.75, 0.50, 0.01]
# Number of fit/evaluate repetitions averaged for every held-out fraction.
rounds = 10

# Iteration cap shared by every estimator below.
MAX_ITER = 110

sgd = SGDClassifier(max_iter=MAX_ITER)
averaged_sgd = SGDClassifier(max_iter=MAX_ITER, average=True)
perceptron = Perceptron(max_iter=MAX_ITER)
passive_aggressive_i = PassiveAggressiveClassifier(
    max_iter=MAX_ITER, loss="hinge", C=1.0, tol=1e-4
)
passive_aggressive_ii = PassiveAggressiveClassifier(
    max_iter=MAX_ITER, loss="squared_hinge", C=1.0, tol=1e-4
)
# The regularisation constant C is scaled by the number of samples in X.
sag = LogisticRegression(
    max_iter=MAX_ITER, solver="sag", tol=1e-1, C=1.0e4 / X.shape[0]
)

# (legend label, estimator) pairs, in plotting order.
classifiers = [
    ("SGD", sgd),
    ("ASGD", averaged_sgd),
    ("Perceptron", perceptron),
    ("Passive-Aggressive I", passive_aggressive_i),
    ("Passive-Aggressive II", passive_aggressive_ii),
    ("SAG", sag),
]

# Proportion of the data used for training (x axis of the plot).
xx = 1.0 - np.asarray(heldout)
# -----------------------------
# 3. Train and evaluate the models
# -----------------------------
def _split_error(clf, test_size, rng):
    """Fit ``clf`` on one random split of (X, y) and return its test error rate."""
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        X, y, test_size=test_size, random_state=rng
    )
    clf.fit(X_fit, y_fit)
    return 1 - np.mean(clf.predict(X_eval) == y_eval)


for name, clf in classifiers:
    print(f"Обучение: {name}")
    # Re-seeded for every classifier, so each one draws its splits from an
    # identically seeded generator.
    rng = np.random.RandomState(42)
    # One averaged error per held-out fraction, each over `rounds` splits.
    yy = [
        np.mean([_split_error(clf, test_size, rng) for _ in range(rounds)])
        for test_size in heldout
    ]
    plt.plot(xx, yy, label=name)

# -----------------------------
# 4. Plot the results
# -----------------------------
plt.legend(loc="upper right")
plt.xlabel("Пропорция обучающей выборки")
plt.ylabel("Ошибка на тесте")
plt.title("Сравнение онлайн-алгоритмов на уменьшенном EMNIST Letters")
plt.grid(True)
plt.show()

70
plot_sgd_comparison.py Normal file
View File

@ -0,0 +1,70 @@
"""
==================================
Comparing various online solvers
==================================
An example showing how different online solvers perform
on the hand-written digits dataset.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.linear_model import (
LogisticRegression,
PassiveAggressiveClassifier,
Perceptron,
SGDClassifier,
)
from sklearn.model_selection import train_test_split
# Fractions of the dataset held out for testing; one plotted point per entry.
heldout = [0.95, 0.90, 0.75, 0.50, 0.01]

# How many times each estimator is fitted and scored per held-out fraction.
rounds = 10

X, y = datasets.load_digits(return_X_y=True)

# Legend label -> estimator; insertion order is the plotting order.
_estimators = {
    "SGD": SGDClassifier(max_iter=110),
    "ASGD": SGDClassifier(max_iter=110, average=True),
    "Perceptron": Perceptron(max_iter=110),
    "Passive-Aggressive I": PassiveAggressiveClassifier(
        max_iter=110, loss="hinge", C=1.0, tol=1e-4
    ),
    "Passive-Aggressive II": PassiveAggressiveClassifier(
        max_iter=110, loss="squared_hinge", C=1.0, tol=1e-4
    ),
    # C is scaled by the number of samples in X.
    "SAG": LogisticRegression(
        max_iter=110, solver="sag", tol=1e-1, C=1.0e4 / len(X)
    ),
}
classifiers = list(_estimators.items())

# Training proportion matching each held-out fraction (x axis of the plot).
xx = 1.0 - np.array(heldout)
for name, clf in classifiers:
    print("training %s" % name)
    # Each classifier starts from a generator seeded with the same value.
    rng = np.random.RandomState(42)
    yy = []
    for test_fraction in heldout:
        # Test error of every repetition at this held-out fraction.
        round_errors = np.empty(rounds)
        for round_idx in range(rounds):
            X_fit, X_eval, y_fit, y_eval = train_test_split(
                X, y, test_size=test_fraction, random_state=rng
            )
            clf.fit(X_fit, y_fit)
            predictions = clf.predict(X_eval)
            round_errors[round_idx] = 1 - np.mean(predictions == y_eval)
        yy.append(np.mean(round_errors))
    # One error-vs-training-proportion curve per classifier.
    plt.plot(xx, yy, label=name)

plt.legend(loc="upper right")
plt.xlabel("Proportion train")
plt.ylabel("Test Error Rate")
plt.show()