#!/usr/bin/env python
# coding: utf-8

# ### Loading the source database

# In[1]:


import pandas as pd

df = pd.read_csv('База_данных_отзывы.csv', delimiter=';', header=None, usecols=[10])  # select the column we need
df.columns = ['Отзывы']
documents = df['Отзывы'].dropna().tolist()
print(documents)


# In[2]:


documents.pop(0)  # drop the header row that was read in as data
print(documents[0])


# ### Data preprocessing

# In[3]:


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download the stop words and tokenizer data
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('russian'))
punctuation = set(string.punctuation)

preprocessed_text = []

def preprocess_text(text):
    words = word_tokenize(text.lower())  # lowercase and tokenize
    filtered_words = [word for word in words if word not in stop_words and word not in punctuation]
    return " ".join(filtered_words)

# Preprocess the text data
for doc in documents:
    preprocessed_text.append(preprocess_text(doc))
print(preprocessed_text[10])


# In[4]:


import spacy

preprocessed_lemmatized_text = []

# Load the spaCy language model for Russian
nlp = spacy.load("ru_core_news_md")

def preprocess_and_lemmatize(text):
    doc = nlp(text.lower())  # lowercase and lemmatize
    lemmatized_words = [token.lemma_ for token in doc if token.text not in punctuation and token.text not in stop_words]
    return " ".join(lemmatized_words)

# Preprocess and lemmatize the texts
for text in preprocessed_text:
    preprocessed_lemmatized_text.append(preprocess_and_lemmatize(text))
print(preprocessed_lemmatized_text[10])
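# Calling nlp() once per document is slow on a large corpus. A minimal
# alternative sketch (batch size and disabled components are illustrative
# choices, not from the original run): spaCy's nlp.pipe() streams texts in
# batches and can skip pipeline components that lemmatization does not need.

# In[ ]:


preprocessed_lemmatized_text_fast = [
    " ".join(t.lemma_ for t in doc if t.text not in punctuation and t.text not in stop_words)
    for doc in nlp.pipe(preprocessed_text, batch_size=64, disable=["parser", "ner"])
]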
# In[6]:


print(len(preprocessed_lemmatized_text))


# In[17]:


print(preprocessed_lemmatized_text[940:950])


# In[37]:


import csv
with open('out.csv', 'w', encoding="utf-8", newline='') as file:
    writer = csv.writer(file)
    writer.writerow(preprocessed_lemmatized_text)  # save the lemmatized reviews to a csv file (as a single row; a per-row variant is sketched below)
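# writerow() above stores the whole corpus as one CSV row. If one review per
# row is the intended layout (an assumption; the filename below is
# illustrative), a small variation would be:

# In[ ]:


with open('out_per_row.csv', 'w', encoding="utf-8", newline='') as file:
    writer = csv.writer(file)
    writer.writerows([text] for text in preprocessed_lemmatized_text)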
# ### Building the TF-IDF keyword matrix

# In[5]:


from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Create the TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer()

# Apply TF-IDF to the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_lemmatized_text)

# Get the keywords and their TF-IDF scores for the first document
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is the current API)
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.toarray()[0]

# Sort the words by TF-IDF score
sorted_keywords = [word for _, word in sorted(zip(tfidf_scores, feature_names), reverse=True)]

print("Keywords:", sorted_keywords)
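# The default TfidfVectorizer keeps every token in the vocabulary. A hedged
# sketch (variable names and parameter values are illustrative) for trimming
# rare tokens and capping the vocabulary, which shrinks the matrix the
# classifiers below would train on:

# In[ ]:


compact_vectorizer = TfidfVectorizer(min_df=2, max_features=20000)
compact_matrix = compact_vectorizer.fit_transform(preprocessed_lemmatized_text)
print(compact_matrix.shape)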
# In[9]:


tfidf_matrix.shape


# In[9]:


print(tfidf_matrix)


# ### Converting the "Ценности" column into a numeric label list for the classifier

# In[23]:


tfidf_matrix.shape


# In[24]:


tsvd_data2D.shape  # defined further down, in the TruncatedSVD cell; this cell was executed out of order
# In[6]:


import pandas as pd

df = pd.read_csv('База_данных_отзывы.csv', delimiter=';', header=0, usecols=[10, 17])  # select the columns we need


# In[7]:


df = df.dropna()


# In[8]:


# 'релевантно' (relevant) -> 1, 'не определено' (undetermined) -> 0
df["Ценности"] = df["Ценности"].replace({'релевантно': 1, 'не определено': 0})


# In[9]:


values = df['Ценности'].tolist()


# In[16]:


print(values[940:950])


# In[38]:


df["Ценности"].sum()  # number of reviews labeled relevant
# ### Random forest classifier

# #### Using the full-dimensional TF-IDF matrix

# In[10]:


from sklearn.model_selection import train_test_split

x = tfidf_matrix
y = values

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=90)
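# The positive class is a minority (see the label sum above), so a stratified
# split keeps the class ratio the same in train and test. A sketch of the same
# split with stratification (the _s names are illustrative; the notebook used
# the plain split above):

# In[ ]:


X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    x, y, test_size=0.3, random_state=90, stratify=y)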
# In[11]:


from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=3000, max_depth=2)  # many very shallow trees
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# In[13]:


from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))


# In[12]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))
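# confusion_matrix is imported above but never called. A minimal usage sketch
# on the same predictions:

# In[ ]:


print(confusion_matrix(y_test, y_pred))  # rows: true class, columns: predicted class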
# In[36]:


import numpy as np
np.array(y_test).sum()  # number of positive examples in the test set


# In[41]:


y_pred.shape


# In[43]:


y_pred


# In[42]:


y_pred.sum()  # number of positive predictions


# In[22]:


tsvd_data2D.shape  # see the TruncatedSVD cell below
# #### Using the reduced-dimensional TF-IDF matrix

# In[14]:


from sklearn.decomposition import TruncatedSVD

# Dimensionality reduction
tsvd2D = TruncatedSVD(n_components=1024)
tsvd_data2D = tsvd2D.fit_transform(tfidf_matrix)
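# TruncatedSVD exposes how much of the TF-IDF variance the 1024 components
# retain, which helps judge whether the reduction loses information. A short
# optional check:

# In[ ]:


print("Retained variance:", tsvd2D.explained_variance_ratio_.sum())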
# In[15]:


from sklearn.model_selection import train_test_split

x = tsvd_data2D
y = values

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=90)

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=2000, max_depth=2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# In[17]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))


# In[16]:


from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))


# In[65]:


y_pred.sum()
# ### K-nearest neighbors

# #### Using the full-dimensional TF-IDF matrix

# In[221]:


from sklearn.model_selection import train_test_split

x = tfidf_matrix
y = values

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=90)


# In[222]:


from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=1024)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
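# With n_neighbors=1024 the classifier averages over a very wide neighborhood
# and tends toward the majority class. A sketch for picking k by
# cross-validated F1 instead (the candidate values are illustrative):

# In[ ]:


from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(KNeighborsClassifier(),
                    {'n_neighbors': [5, 15, 51, 201]},
                    scoring='f1', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)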
# In[224]:


from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(sum(y_pred), y_pred.shape)


# In[215]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))


# In[223]:


print(sum(y_pred), y_pred.shape)
# #### Using the reduced-dimensional TF-IDF matrix

# In[25]:


# use the reduced-dimensional matrix as X
from sklearn.model_selection import train_test_split

x = tsvd_data2D
y = values

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=90)

from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=1024)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# In[226]:


from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(sum(y_pred), y_pred.shape)


# In[227]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))
# ### Logistic regression

# #### Using the full-dimensional TF-IDF matrix

# In[232]:


from sklearn.linear_model import LogisticRegression

x = tfidf_matrix
y = values

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=90)
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(sum(y_pred), y_pred.shape)
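# Given the class imbalance, LogisticRegression can reweight the classes
# instead of optimizing plain accuracy. A hedged variant (the clf_balanced
# name is illustrative; this is not what the notebook ran):

# In[ ]:


clf_balanced = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)
print(classification_report(y_test, clf_balanced.predict(X_test)))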
# In[233]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))
# #### Using the reduced-dimensional TF-IDF matrix

# In[234]:


# use the reduced-dimensional matrix as X
from sklearn.linear_model import LogisticRegression

x = tsvd_data2D
y = values

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=90)
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(sum(y_pred), y_pred.shape)
# In[235]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))
# ### Topic modeling

# #### Non-negative matrix factorization (NMF)

# In[19]:


from sklearn.decomposition import NMF
import csv

# Prepare the data
X = tfidf_matrix

# Apply NMF
num_topics = 1000
nmf = NMF(n_components=num_topics, random_state=42)
nmf.fit(X)

# Write the top ten words of each topic to a CSV file
feature_names = tfidf_vectorizer.get_feature_names_out()
with open('Тематическое_моделирование_NMF.csv', 'w', encoding="utf-8", newline='') as file:
    writer = csv.writer(file)
    for topic_idx, topic_words in enumerate(nmf.components_):
        top_words_idx = topic_words.argsort()[-10:][::-1]
        top_words = [feature_names[i] for i in top_words_idx]
        writer.writerow(top_words)
# In[20]:


nmf_matrix = nmf.fit_transform(X)  # document-topic weights (fit_transform refits the model)


# In[50]:


nmf_tr = nmf_matrix.T  # topic-document view: one row per topic


# In[107]:


trans = nmf_tr.T  # back to document-topic orientation
print(trans)  # the original matrix of confidences that a review belongs to each topic


# In[37]:


tm = pd.DataFrame(nmf_matrix, columns=['Тема№' + f'{i}' for i in range(1, 1001)])
tm.head(10)
# In[38]:


tm.insert(loc=1000, column='Ценности', value=values)


# In[39]:


tm.head(10)


# In[71]:


q = tm[tm['Ценности'] == 1]  # only the reviews labeled relevant
q


# In[72]:


q.shape
# In[78]:


name = list(tm.columns)
name.pop(-1)  # drop the label column, keep only the topic columns
w = q[name]


# In[83]:


w


# In[92]:


res = []
for column in w.columns:
    # store the rows of a column in a temporary list
    li = w[column].tolist()
    # append the temporary list
    res.append(li)
print(len(res))
# In[91]:


t = [sum(nmf_tr[i]) for i in range(1000)]  # per-topic sums over all reviews
print(t)


# In[94]:


sum_1 = [sum(res[i]) for i in range(1000)]  # per-topic sums over the relevant reviews only
print(sum_1)


# In[152]:


import numpy as np
p = np.array(sum_1) / np.array(t)
# share of each topic's weight that comes from relevant reviews,
# i.e. the probability that the topic corresponds to a cultural value
print(p)
# In[237]:


pik = p[p > 0.9]  # select the topics whose probability exceeds 0.9
print(pik)


# In[246]:


# indices of those topics; np.flatnonzero avoids the index() lookup,
# which returns the wrong position when two topics share the same value
inde = np.flatnonzero(p > 0.9).tolist()
print(inde)
# In[247]:


with open('Тематическое_моделирование_NMF.csv', encoding="utf-8") as fd:
    reader = csv.reader(fd)
    interestingrows = [row for idx, row in enumerate(reader) if idx in inde]


# In[249]:


for i in interestingrows:
    print(i)  # inspect what these topics are about from their key words
# In[173]:


ind = list(range(1, 1001))


# In[181]:


import matplotlib.pyplot as plt  # needed for the figure below
from matplotlib.backends.backend_pdf import PdfPages


# In[191]:


fig = plt.figure(figsize=(50, 20))
plt.xlabel('Topic number')
plt.ylabel('Probability that the topic relates to values')
plt.bar(ind, p)
#plt.show()
pdf = PdfPages("Гистограмма.pdf")
pdf.savefig(fig)

# Save the file
pdf.close()
# In[183]:


min_v = min(p)
max_v = max(p)
print(min_v, max_v)
# In[153]:


itog = []
for row in trans:
    weighted = np.array(row) * np.array(p)  # weight each review's topic loadings by the topic probabilities
    itog.append(weighted)
print(itog)


# In[154]:


np.array(itog).shape


# In[155]:


da = pd.DataFrame(np.array(itog), columns=['Тема№' + f'{i}' for i in range(1, 1001)])
da.head(10)  # the final result
# ### Clustering and categorizing the text data

# In[42]:


from sklearn.cluster import KMeans
import numpy as np
import csv

# Apply KMeans clustering to the TF-IDF matrix
num_clusters = 100
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(tfidf_matrix)
klaster = []
with open('кластеры_отзывов.csv', 'w', encoding="utf-8", newline='') as file:
    writer = csv.writer(file)
    # Show sample documents from each cluster and write each cluster to its own row
    for cluster_id in range(num_clusters):
        cluster_indices = np.where(kmeans.labels_ == cluster_id)[0]
        print(f"Cluster {cluster_id + 1}:")
        for idx in cluster_indices:
            print(documents[idx])
            klaster.append(documents[idx])
        writer.writerow(klaster)
        klaster = []
        print("--------")
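# The dump above prints raw reviews per cluster. KMeans also exposes the
# cluster centers in TF-IDF space, so each cluster's dominant terms can be
# listed directly. A short optional sketch (the five-cluster limit is just
# for brevity):

# In[ ]:


terms = tfidf_vectorizer.get_feature_names_out()
for cluster_id, center in enumerate(kmeans.cluster_centers_[:5]):
    top_terms = [terms[i] for i in center.argsort()[-10:][::-1]]
    print(f"Cluster {cluster_id + 1}:", ", ".join(top_terms))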
# In[41]:


print(documents[0])
# ### Synonyms

# In[18]:


import re
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec

# Concatenate all reviews into the variable article_text
article_text = " ".join(documents)
# Lowercase everything
cleaned_article = article_text.lower()
# Keep only Cyrillic letters and collapse whitespace with regular expressions
cleaned_article = re.sub('[^а-яё]', ' ', cleaned_article)
cleaned_article = re.sub(r'\s+', ' ', cleaned_article)
# Prepare the training dataset
# (punctuation was stripped above, so sent_tokenize yields one long "sentence")
all_sentences = nltk.sent_tokenize(cleaned_article)
all_words = [nltk.word_tokenize(sent) for sent in all_sentences]
# Walk over the dataset and remove stop words
russian_stopwords = set(stopwords.words('russian'))
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in russian_stopwords]
# Build a Word2Vec model over the words that occur often enough in the text
word2vec = Word2Vec(all_words, min_count=3)
print(word2vec.wv.most_similar('справедливость', topn=5))
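# Training takes a while, so persisting the model avoids recomputing the
# embeddings. A minimal save/load sketch using gensim's native format (the
# filename is illustrative):

# In[ ]:


word2vec.save("word2vec_отзывы.model")
word2vec_reloaded = Word2Vec.load("word2vec_отзывы.model")
print(word2vec_reloaded.wv.most_similar('справедливость', topn=5))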
# In[23]:


print(word2vec.wv.most_similar('труд', topn=100))


# In[122]:


import csv
with open('словарь_слов_из_отзывов.csv', 'w', encoding="utf-8", newline='') as file:
    writer = csv.writer(file)
    writer.writerow(all_words)