419 lines
70 KiB
Plaintext
419 lines
70 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "a353e94a-5fbe-4771-90ef-520ee626685d",
|
||
"metadata": {},
|
||
"source": [
|
||
"Повторим шаги из exd1.ipynb, но с использованием датасета News_Category_Dataset_v3 с сайта Kaggle.com"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "774e50b7-8c35-430b-b5e3-acfd2bfc9a18",
|
||
"metadata": {},
|
||
"source": [
|
||
"1) Подготовка данных"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "fb9195ff-69c3-4194-9f84-8d1698444e09",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"84381 документов - 5 категории\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import json\n",
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"from collections import defaultdict\n",
|
||
"from time import time\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"\n",
|
||
"from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, TfidfTransformer\n",
|
||
"from sklearn.decomposition import TruncatedSVD\n",
|
||
"from sklearn.pipeline import make_pipeline\n",
|
||
"from sklearn.preprocessing import Normalizer\n",
|
||
"from sklearn.cluster import KMeans, MiniBatchKMeans\n",
|
||
"from sklearn import metrics\n",
|
||
"\n",
|
||
"# Загрузка пользовательского датасета\n",
|
||
"with open(\"News_Category_Dataset_v3.json\", encoding=\"utf-8\") as f:\n",
|
||
" data = [json.loads(line) for line in f]\n",
|
||
"\n",
|
||
"df = pd.DataFrame(data)\n",
|
||
"\n",
|
||
"# Фильтрация только нужных столбцов\n",
|
||
"df = df[[\"category\", \"short_description\"]]\n",
|
||
"df = df[df[\"short_description\"].str.strip() != \"\"] # удалим пустые описания\n",
|
||
"\n",
|
||
"# Для примера ограничим количество категорий\n",
|
||
"top_categories = df[\"category\"].value_counts().nlargest(5).index.tolist()\n",
|
||
"df = df[df[\"category\"].isin(top_categories)]\n",
|
||
"\n",
|
||
"texts = df[\"short_description\"].tolist()\n",
|
||
"labels, label_names = pd.factorize(df[\"category\"])\n",
|
||
"true_k = len(label_names)\n",
|
||
"unique_labels, category_sizes = np.unique(labels, return_counts=True)\n",
|
||
"\n",
|
||
"print(f\"{len(texts)} документов - {true_k} категории\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "69934cda-68b2-42b3-a2fb-c6cc8473537c",
|
||
"metadata": {},
|
||
"source": [
|
||
"Создаём функцию fit_and_evaluate, которая обучает модель кластеризации несколько раз с разными random_state и вычисляет метрики"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "72107cf7-917b-42da-8ad1-a4e715903961",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"evaluations = []\n",
|
||
"evaluations_std = []\n",
|
||
"\n",
|
||
"def fit_and_evaluate(km, X, name=None, n_runs=5):\n",
|
||
" name = km.__class__.__name__ if name is None else name\n",
|
||
"\n",
|
||
" train_times = []\n",
|
||
" scores = defaultdict(list)\n",
|
||
" for seed in range(n_runs):\n",
|
||
" km.set_params(random_state=seed)\n",
|
||
" t0 = time()\n",
|
||
" km.fit(X)\n",
|
||
" train_times.append(time() - t0)\n",
|
||
" scores[\"Homogeneity\"].append(metrics.homogeneity_score(labels, km.labels_))\n",
|
||
" scores[\"Completeness\"].append(metrics.completeness_score(labels, km.labels_))\n",
|
||
" scores[\"V-measure\"].append(metrics.v_measure_score(labels, km.labels_))\n",
|
||
" scores[\"Adjusted Rand-Index\"].append(metrics.adjusted_rand_score(labels, km.labels_))\n",
|
||
" scores[\"Silhouette Coefficient\"].append(metrics.silhouette_score(X, km.labels_, sample_size=2000))\n",
|
||
"\n",
|
||
" train_times = np.asarray(train_times)\n",
|
||
"\n",
|
||
" print(f\"clustering done in {train_times.mean():.2f} ± {train_times.std():.2f} s \")\n",
|
||
" evaluation = {\"estimator\": name, \"train_time\": train_times.mean()}\n",
|
||
" evaluation_std = {\"estimator\": name, \"train_time\": train_times.std()}\n",
|
||
" for score_name, score_values in scores.items():\n",
|
||
" mean_score, std_score = np.mean(score_values), np.std(score_values)\n",
|
||
" print(f\"{score_name}: {mean_score:.3f} ± {std_score:.3f}\")\n",
|
||
" evaluation[score_name] = mean_score\n",
|
||
" evaluation_std[score_name] = std_score\n",
|
||
"\n",
|
||
" evaluations.append(evaluation)\n",
|
||
" evaluations_std.append(evaluation_std)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "c7cc318e-3f37-46e8-a33d-4588de702ffa",
|
||
"metadata": {},
|
||
"source": [
|
||
"Преобразуем текстовые данные в векторное представление"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "32c54c82-755f-4749-82fc-a03294872f48",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"vectorization done in 0.745 s\n",
|
||
"n_samples: 84381, n_features: 17006\n",
|
||
"0.001\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"vectorizer = TfidfVectorizer(max_df=0.5, min_df=5, stop_words=\"english\")\n",
|
||
"\n",
|
||
"t0 = time()\n",
|
||
"X_tfidf = vectorizer.fit_transform(texts)\n",
|
||
"\n",
|
||
"print(f\"vectorization done in {time() - t0:.3f} s\")\n",
|
||
"print(f\"n_samples: {X_tfidf.shape[0]}, n_features: {X_tfidf.shape[1]}\")\n",
|
||
"print(f\"{X_tfidf.nnz / np.prod(X_tfidf.shape):.3f}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "f37313d7-671f-4a5c-b021-f308a680343f",
|
||
"metadata": {},
|
||
"source": [
|
||
"Применяем KMeans для кластеризации текстов на основе TF-IDF векторов"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "347510a6-d160-4a55-a9dc-244f104d56cb",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Количество документов в каждом кластере: [ 3116 3626 961 2587 74091]\n",
|
||
"Количество документов в каждом кластере: [ 22 81624 1261 1170 304]\n",
|
||
"Количество документов в каждом кластере: [ 3497 73711 961 1948 4264]\n",
|
||
"Количество документов в каждом кластере: [ 3385 961 67675 6706 5654]\n",
|
||
"Количество документов в каждом кластере: [ 961 3537 73170 2227 4486]\n",
|
||
"Настоящее распределение по категориям: [14774 32441 17943 9802 9421]\n",
|
||
"clustering done in 0.38 ± 0.02 s \n",
|
||
"Homogeneity: 0.036 ± 0.018\n",
|
||
"Completeness: 0.086 ± 0.036\n",
|
||
"V-measure: 0.051 ± 0.024\n",
|
||
"Adjusted Rand-Index: 0.002 ± 0.016\n",
|
||
"Silhouette Coefficient: 0.004 ± 0.001\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"for seed in range(5):\n",
|
||
" kmeans = KMeans(n_clusters=true_k, max_iter=100, n_init=1, random_state=seed).fit(X_tfidf)\n",
|
||
" cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)\n",
|
||
" print(f\"Количество документов в каждом кластере: {cluster_sizes}\")\n",
|
||
"print(f\"Настоящее распределение по категориям: {category_sizes}\")\n",
|
||
"\n",
|
||
"# Повтор с лучшим результатом по инерции\n",
|
||
"kmeans = KMeans(n_clusters=true_k, max_iter=100, n_init=5)\n",
|
||
"fit_and_evaluate(kmeans, X_tfidf, name=\"KMeans\\non tf-idf vectors\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "f8824341-eb9d-4509-a8e1-4ce0b37a206a",
|
||
"metadata": {},
|
||
"source": [
|
||
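"Применяем LSA (TruncatedSVD с последующей нормализацией) для понижения размерности TF-IDF векторов и повторяем кластеризацию с помощью KMeans и MiniBatchKMeans"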
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "adeb7956-6d2e-4efc-98d8-62e46d7ca32b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"LSA done in 1.497 s\n",
|
||
"Explained variance: 11.1%\n",
|
||
"clustering done in 0.15 ± 0.05 s \n",
|
||
"Homogeneity: 0.050 ± 0.028\n",
|
||
"Completeness: 0.068 ± 0.027\n",
|
||
"V-measure: 0.057 ± 0.029\n",
|
||
"Adjusted Rand-Index: 0.001 ± 0.010\n",
|
||
"Silhouette Coefficient: 0.027 ± 0.009\n",
|
||
"clustering done in 0.24 ± 0.03 s \n",
|
||
"Homogeneity: 0.096 ± 0.015\n",
|
||
"Completeness: 0.112 ± 0.019\n",
|
||
"V-measure: 0.104 ± 0.016\n",
|
||
"Adjusted Rand-Index: 0.041 ± 0.026\n",
|
||
"Silhouette Coefficient: 0.029 ± 0.002\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))\n",
|
||
"t0 = time()\n",
|
||
"X_lsa = lsa.fit_transform(X_tfidf)\n",
|
||
"explained_variance = lsa[0].explained_variance_ratio_.sum()\n",
|
||
"\n",
|
||
"print(f\"LSA done in {time() - t0:.3f} s\")\n",
|
||
"print(f\"Explained variance: {explained_variance * 100:.1f}%\")\n",
|
||
"\n",
|
||
"# Повтор кластеризации на пониженной размерности\n",
|
||
"kmeans = KMeans(n_clusters=true_k, max_iter=100, n_init=1)\n",
|
||
"fit_and_evaluate(kmeans, X_lsa, name=\"KMeans\\nwith LSA on tf-idf vectors\")\n",
|
||
"\n",
|
||
"minibatch_kmeans = MiniBatchKMeans(n_clusters=true_k, n_init=1, init_size=1000, batch_size=1000)\n",
|
||
"fit_and_evaluate(minibatch_kmeans, X_lsa, name=\"MiniBatchKMeans\\nwith LSA on tf-idf vectors\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "0fdd070b-0167-4ecc-9bec-ff038dddc61a",
|
||
"metadata": {},
|
||
"source": [
|
||
"Определяем, какие слова лучше всего характеризуют каждый кластер"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "2e9617e3-f2d2-4527-9062-e6cdbaef5882",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Кластер 0: new time just like year life day world don ve \n",
|
||
"Кластер 1: people health help study national government americans public cancer sleep \n",
|
||
"Кластер 2: film video red night movie season summer singer home actor \n",
|
||
"Кластер 3: said president obama elect vice house people senator new secretary \n",
|
||
"Кластер 4: trump donald president administration campaign said house clinton republican election \n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)\n",
|
||
"order_centroids = original_space_centroids.argsort()[:, ::-1]\n",
|
||
"terms = vectorizer.get_feature_names_out()\n",
|
||
"\n",
|
||
"for i in range(true_k):\n",
|
||
" print(f\"Кластер {i}:\", end=\" \")\n",
|
||
" for ind in order_centroids[i, :10]:\n",
|
||
" print(f\"{terms[ind]}\", end=\" \")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9ab1a8e4-b067-4ada-a966-1b312dbcfbcf",
|
||
"metadata": {},
|
||
"source": [
|
||
"Вместо TfidfVectorizer используем HashingVectorizer + TfidfTransformer + LSA"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "9be11eaf-9188-4c86-92e3-16e3e71b0d3b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"vectorization done in 2.805 s\n",
|
||
"clustering done in 0.14 ± 0.02 s \n",
|
||
"Homogeneity: 0.075 ± 0.044\n",
|
||
"Completeness: 0.090 ± 0.041\n",
|
||
"V-measure: 0.081 ± 0.043\n",
|
||
"Adjusted Rand-Index: 0.019 ± 0.022\n",
|
||
"Silhouette Coefficient: 0.029 ± 0.004\n",
|
||
"clustering done in 0.25 ± 0.03 s \n",
|
||
"Homogeneity: 0.086 ± 0.024\n",
|
||
"Completeness: 0.102 ± 0.024\n",
|
||
"V-measure: 0.093 ± 0.024\n",
|
||
"Adjusted Rand-Index: 0.036 ± 0.016\n",
|
||
"Silhouette Coefficient: 0.028 ± 0.004\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"lsa_vectorizer = make_pipeline(\n",
|
||
" HashingVectorizer(stop_words=\"english\", n_features=50000),\n",
|
||
" TfidfTransformer(),\n",
|
||
" TruncatedSVD(n_components=100, random_state=0),\n",
|
||
" Normalizer(copy=False),\n",
|
||
")\n",
|
||
"\n",
|
||
"t0 = time()\n",
|
||
"X_hashed_lsa = lsa_vectorizer.fit_transform(texts)\n",
|
||
"print(f\"vectorization done in {time() - t0:.3f} s\")\n",
|
||
"\n",
|
||
"fit_and_evaluate(kmeans, X_hashed_lsa, name=\"KMeans\\nwith LSA on hashed vectors\")\n",
|
||
"fit_and_evaluate(minibatch_kmeans, X_hashed_lsa, name=\"MiniBatchKMeans\\nwith LSA on hashed vectors\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9cc1b5c8-0bcc-4ad9-9d56-3a3400098451",
|
||
"metadata": {},
|
||
"source": [
|
||
"Строим таблицу и визуализацию с результатами кластеризации"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "903862f5-44bf-4cfe-8b97-3ed0d3f8a40b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1600x600 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(16, 6), sharey=True)\n",
|
||
"\n",
|
||
"df = pd.DataFrame(evaluations[::-1]).set_index(\"estimator\")\n",
|
||
"df_std = pd.DataFrame(evaluations_std[::-1]).set_index(\"estimator\")\n",
|
||
"\n",
|
||
"df.drop([\"train_time\"], axis=\"columns\").plot.barh(ax=ax0, xerr=df_std)\n",
|
||
"ax0.set_xlabel(\"Clustering scores\")\n",
|
||
"ax0.set_ylabel(\"\")\n",
|
||
"\n",
|
||
"df[\"train_time\"].plot.barh(ax=ax1, xerr=df_std[\"train_time\"])\n",
|
||
"ax1.set_xlabel(\"Clustering time (s)\")\n",
|
||
"plt.tight_layout()\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "6a88be40-ab21-4f56-8fb8-91636c05d1e7",
|
||
"metadata": {},
|
||
"source": [
|
||
"Модель успешно справилась с заданием на скачанном мной датасете. Наиболее эффективным вновь оказался K-means с использованием LSA на хэшированных векторах, но стоит отметить, что время кластеризации выросло ввиду того, что количество информации в скачанном датасете больше, чем во встроенном датасете fetch_20newsgroups"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "3c05ee8c-fc24-4a3c-bedd-180e53aace55",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.13.3"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|